Spaces:

anoopreddyyeddula
/

Automated-Insurance-Claim-Validation-System

Sleeping

anoopreddyyeddula committed on Apr 27, 2025

Commit

bef72f9

1 Parent(s): 9b0cc99

Add robust file validation and error handling

- Add file type and size validation
- Improve OCR preprocessing and error handling
- Enhance Gradio interface with better descriptions
- Add detailed logging for debugging
- Update documentation with file size limits

Files changed (1) hide show

app.py +123 -24

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ import PyPDF2
 import pandas as pd
 import io
 from datetime import datetime
 # Initialize the OCR reader
 reader = easyocr.Reader(['en'])
@@ -21,8 +22,34 @@ doc_classifier = pipeline("image-classification", model="microsoft/resnet-50")
 def convert_pdf_to_images(pdf_file):
     """Convert PDF to list of images"""
     try:
-        return pdf2image.convert_from_bytes(pdf_file.read())
     except Exception as e:
         return None
 def process_single_image(image):
@@ -72,6 +99,12 @@ def ocr_function(image):
         if isinstance(image, Image.Image):
             image = np.array(image)
         # Perform OCR
         results = reader.readtext(image)
@@ -83,6 +116,7 @@ def ocr_function(image):
         return text.strip()
     except Exception as e:
         return f"OCR Error: {str(e)}"
 def validate_text(text):
@@ -122,34 +156,86 @@ def validate_image(image):
     except Exception as e:
         return False, f"Image validation error: {str(e)}"
 def process_claim(file):
     try:
         if file is None:
             return "No file provided", "N/A", "N/A", None
-        # Check if file is PDF
-        if hasattr(file, 'name') and file.name.lower().endswith('.pdf'):
             images = convert_pdf_to_images(file)
             if not images:
-                return "Failed to process PDF", "Error", "Error", None
         else:
-            # Handle single image
-            if isinstance(file, np.ndarray):
-                images = [Image.fromarray(file)]
-            else:
-                images = [file]
-        # Process each page
         results = []
         for idx, img in enumerate(images):
-            result = process_single_image(img)
-            result['page'] = idx + 1
-            results.append(result)
         # Generate report
-        summary, excel_file = generate_report(results)
-        # Return results for display
         return (
             "\n\n=== Page Break ===\n\n".join([r['text'] for r in results]),
             "\n".join([f"Page {r['page']}: {r['validation']} ({r['validation_confidence']:.2%})" for r in results]),
@@ -158,22 +244,35 @@ def process_claim(file):
         )
     except Exception as e:
         return f"Processing error: {str(e)}", "Error", "Error", None
-# Create Gradio interface
 iface = gr.Interface(
     fn=process_claim,
-    inputs=gr.File(label="Upload Insurance Document (Image or PDF)"),
     outputs=[
-        gr.Textbox(label="Extracted Text"),
-        gr.Textbox(label="Text Validation"),
-        gr.Textbox(label="Document Classification"),
-        gr.File(label="Download Report")
     ],
     title="Insurance Claim Validation System",
-    description="Upload an insurance claim document (PDF or image) to validate and classify it.",
-    # Remove the examples for now
-    theme=gr.themes.Soft()
 )
 if __name__ == "__main__":

 import pandas as pd
 import io
 from datetime import datetime
+import cv2
 # Initialize the OCR reader
 reader = easyocr.Reader(['en'])
 def convert_pdf_to_images(pdf_file):
     """Convert PDF to list of images"""
     try:
+        # Save PDF content to a temporary file
+        pdf_content = pdf_file.read()
+        pdf_buffer = io.BytesIO(pdf_content)
+        # Check if PDF is valid
+        try:
+            pdf_reader = PyPDF2.PdfReader(pdf_buffer)
+            if len(pdf_reader.pages) == 0:
+                raise ValueError("PDF has no pages")
+        except Exception as e:
+            raise ValueError(f"Invalid PDF file: {str(e)}")
+        # Reset buffer position
+        pdf_buffer.seek(0)
+        # Convert to images
+        images = pdf2image.convert_from_bytes(
+            pdf_buffer.read(),
+            dpi=300,  # Increase DPI for better quality
+            fmt='PNG'
+        )
+        if not images:
+            raise ValueError("No images extracted from PDF")
+        return images
     except Exception as e:
+        print(f"PDF conversion error: {str(e)}")  # Debug logging
         return None
 def process_single_image(image):
         if isinstance(image, Image.Image):
             image = np.array(image)
+        # Image preprocessing
+        if len(image.shape) == 2:  # Convert grayscale to RGB
+            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
+        elif len(image.shape) == 3 and image.shape[2] == 4:  # Convert RGBA to RGB
+            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
         # Perform OCR
         results = reader.readtext(image)
         return text.strip()
     except Exception as e:
+        print(f"OCR Error: {str(e)}")  # Debug logging
         return f"OCR Error: {str(e)}"
 def validate_text(text):
     except Exception as e:
         return False, f"Image validation error: {str(e)}"
+def validate_file_type(file):
+    """Validate file type and size"""
+    try:
+        if not hasattr(file, 'name'):
+            return False, "Invalid file object"
+        # Get file extension
+        file_ext = file.name.lower().split('.')[-1]
+        # Check allowed extensions
+        allowed_extensions = {'pdf', 'png', 'jpg', 'jpeg', 'tiff'}
+        if file_ext not in allowed_extensions:
+            return False, f"Unsupported file type. Allowed types: {', '.join(allowed_extensions)}"
+        # Check file size (max 10MB)
+        MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB in bytes
+        file.seek(0, 2)  # Seek to end of file
+        file_size = file.tell()
+        file.seek(0)  # Reset file pointer
+        if file_size > MAX_FILE_SIZE:
+            return False, "File too large. Maximum size: 10MB"
+        return True, None
+    except Exception as e:
+        return False, f"File validation error: {str(e)}"
 def process_claim(file):
     try:
         if file is None:
             return "No file provided", "N/A", "N/A", None
+        # Validate file type and size
+        is_valid, error_message = validate_file_type(file)
+        if not is_valid:
+            return error_message, "Error", "Error", None
+        print(f"Processing file: {file.name}")
+        print(f"File type: {type(file)}")
+        # Process PDF
+        if file.name.lower().endswith('.pdf'):
             images = convert_pdf_to_images(file)
             if not images:
+                return "Failed to convert PDF to images. Please check if the PDF is valid.", "Error", "Error", None
         else:
+            # Process image
+            try:
+                img = Image.open(file)
+                images = [img]
+            except Exception as e:
+                return f"Image processing error: {str(e)}", "Error", "Error", None
+        # Process each page/image
         results = []
         for idx, img in enumerate(images):
+            try:
+                # Validate image
+                valid, validated_img = validate_image(img)
+                if not valid:
+                    return f"Invalid image on page {idx + 1}: {validated_img}", "Error", "Error", None
+                # Process image
+                result = process_single_image(validated_img)
+                result['page'] = idx + 1
+                results.append(result)
+            except Exception as e:
+                return f"Error processing page {idx + 1}: {str(e)}", "Error", "Error", None
+        if not results:
+            return "No valid results obtained from processing", "Error", "Error", None
         # Generate report
+        try:
+            summary, excel_file = generate_report(results)
+        except Exception as e:
+            return f"Error generating report: {str(e)}", "Error", "Error", None
+        # Return results
         return (
             "\n\n=== Page Break ===\n\n".join([r['text'] for r in results]),
             "\n".join([f"Page {r['page']}: {r['validation']} ({r['validation_confidence']:.2%})" for r in results]),
         )
     except Exception as e:
+        print(f"Error in process_claim: {str(e)}")
         return f"Processing error: {str(e)}", "Error", "Error", None
+# Update the Gradio interface with better descriptions and examples
 iface = gr.Interface(
     fn=process_claim,
+    inputs=[
+        gr.File(
+            label="Upload Insurance Document",
+            type="file",
+            file_types=['.pdf', '.png', '.jpg', '.jpeg', '.tiff'],
+            description="Supported formats: PDF, PNG, JPG, JPEG, TIFF (Max size: 10MB)"
+        )
+    ],
     outputs=[
+        gr.Textbox(label="Extracted Text", lines=10),
+        gr.Textbox(label="Text Validation Results", lines=5),
+        gr.Textbox(label="Document Classification Results", lines=5),
+        gr.File(label="Download Report (Excel)")
     ],
     title="Insurance Claim Validation System",
+    description="""
+    Upload an insurance claim document to:
+    1. Extract and validate text content
+    2. Classify document type
+    3. Generate detailed analysis report
+    """,
+    theme=gr.themes.Soft(),
+    allow_flagging="never"
 )
 if __name__ == "__main__":