Spaces:

anoopreddyyeddula
/

Automated-Insurance-Claim-Validation-System

Sleeping

App Files Community

anoopreddyyeddula committed on Apr 27, 2025

Commit

df33ba6

2 Parent(s): 75d7def db14a92

fix: resolve merge conflicts and update UI

Browse files

Files changed (1) hide show

app.py +67 -197

app.py CHANGED Viewed

@@ -5,23 +5,31 @@ from transformers import pipeline
 import gradio as gr
 import pdf2image
 import PyPDF2
-import pandas as pd
 import io
-from datetime import datetime
 import cv2
 # Initialize the OCR reader
 reader = easyocr.Reader(['en'])
-# Use text classification model
 text_classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
-# Use document classification model - using a compatible model
 doc_classifier = pipeline("image-classification", model="microsoft/resnet-50")
 def convert_pdf_to_images(pdf_file):
-    """Convert PDF to list of images"""
     try:
         # Save PDF content to a temporary file
         pdf_content = pdf_file.read()
         pdf_buffer = io.BytesIO(pdf_content)
@@ -38,218 +46,80 @@ def convert_pdf_to_images(pdf_file):
         pdf_buffer.seek(0)
         # Convert to images
-        images = pdf2image.convert_from_bytes(
-            pdf_buffer.read(),
-            dpi=300,  # Increase DPI for better quality
-            fmt='PNG'
-        )
         if not images:
             raise ValueError("No images extracted from PDF")
         return images
     except Exception as e:
-        print(f"PDF conversion error: {str(e)}")  # Debug logging
         return None
 def process_single_image(image):
-    """Process a single image and return results"""
-    extracted_text = ocr_function(image)
-    validation_result = text_classifier(extracted_text[:512])
-    sentiment = validation_result[0]['label']
-    confidence = validation_result[0]['score']
-    doc_result = doc_classifier(image)
-    doc_type = doc_result[0]['label']
-    doc_confidence = doc_result[0]['score']
-    return {
-        'text': extracted_text,
-        'validation': sentiment,
-        'validation_confidence': confidence,
-        'doc_type': doc_type,
-        'doc_confidence': doc_confidence
-    }
-def generate_report(results):
-    """Generate a formatted report from results"""
-    df = pd.DataFrame(results)
-    # Create Excel buffer
-    excel_buffer = io.BytesIO()
-    with pd.ExcelWriter(excel_buffer) as writer:
-        df.to_excel(writer, index=False)
-    excel_buffer.seek(0)
-    # Create summary text
-    summary = f"""
-    Insurance Claim Processing Report
-    Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
-    Total Pages Processed: {len(results)}
-    Document Types Found: {', '.join(set(df['doc_type']))}
-    """
-    return summary, excel_buffer
-def ocr_function(image):
     try:
-        # Convert PIL image to numpy array
-        if isinstance(image, Image.Image):
-            image = np.array(image)
-        # Image preprocessing
-        if len(image.shape) == 2:  # Convert grayscale to RGB
-            image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
-        elif len(image.shape) == 3 and image.shape[2] == 4:  # Convert RGBA to RGB
-            image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
-        # Perform OCR
-        results = reader.readtext(image)
-        # Extract text
-        text = ' '.join([result[1] for result in results])
-        if not text.strip():
-            return "No text was detected in the image"
-        return text.strip()
-    except Exception as e:
-        print(f"OCR Error: {str(e)}")  # Debug logging
-        return f"OCR Error: {str(e)}"
-def validate_text(text):
-    try:
-        result = text_classifier(text[:512])  # Limit text length to avoid token limit
-        return result[0]['label']
-    except Exception as e:
-        return f"Validation Error: {str(e)}"
-def classify_document(image):
-    try:
-        # Convert PIL image to RGB if needed
-        if image.mode != 'RGB':
-            image = image.convert('RGB')
-        result = doc_classifier(image)
-        return f"Document Type: {result[0]['label']}"
     except Exception as e:
-        return f"Classification Error: {str(e)}"
-def validate_image(image):
-    """Validate image before processing"""
     try:
-        if image is None:
-            return False, "No image provided"
-        if isinstance(image, np.ndarray):
-            image = Image.fromarray(image)
-        if not isinstance(image, Image.Image):
-            return False, "Invalid image format"
-        # Check image size
-        if image.size[0] * image.size[1] > 5000 * 5000:
-            return False, "Image too large. Maximum size: 5000x5000"
-        return True, image
-    except Exception as e:
-        return False, f"Image validation error: {str(e)}"
-def validate_file_type(file):
-    """Validate file type and size"""
-    try:
-        if not hasattr(file, 'name'):
-            return False, "Invalid file object"
-        # Get file extension
-        file_ext = file.name.lower().split('.')[-1]
-        # Check allowed extensions
-        allowed_extensions = {'pdf', 'png', 'jpg', 'jpeg', 'tiff'}
-        if file_ext not in allowed_extensions:
-            return False, f"Unsupported file type. Allowed types: {', '.join(allowed_extensions)}"
-        # Check file size (max 10MB)
-        MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB in bytes
-        file.seek(0, 2)  # Seek to end of file
-        file_size = file.tell()
-        file.seek(0)  # Reset file pointer
-        if file_size > MAX_FILE_SIZE:
-            return False, "File too large. Maximum size: 10MB"
-        return True, None
-    except Exception as e:
-        return False, f"File validation error: {str(e)}"
-def process_claim(file):
-    try:
-        if file is None:
-            return "No file provided", "N/A", "N/A", None
-        # Validate file type and size
-        is_valid, error_message = validate_file_type(file)
-        if not is_valid:
-            return error_message, "Error", "Error", None
-        print(f"Processing file: {file.name}")
-        print(f"File type: {type(file)}")
-        # Process PDF
-        if file.name.lower().endswith('.pdf'):
-            images = convert_pdf_to_images(file)
-            if not images:
-                return "Failed to convert PDF to images. Please check if the PDF is valid.", "Error", "Error", None
-        else:
-            # Process image
-            try:
-                img = Image.open(file)
-                images = [img]
-            except Exception as e:
-                return f"Image processing error: {str(e)}", "Error", "Error", None
-        # Process each page/image
-        results = []
-        for idx, img in enumerate(images):
-            try:
-                # Validate image
-                valid, validated_img = validate_image(img)
-                if not valid:
-                    return f"Invalid image on page {idx + 1}: {validated_img}", "Error", "Error", None
-                # Process image
-                result = process_single_image(validated_img)
-                result['page'] = idx + 1
-                results.append(result)
-            except Exception as e:
-                return f"Error processing page {idx + 1}: {str(e)}", "Error", "Error", None
-        if not results:
-            return "No valid results obtained from processing", "Error", "Error", None
-        # Generate report
-        try:
-            summary, excel_file = generate_report(results)
-        except Exception as e:
-            return f"Error generating report: {str(e)}", "Error", "Error", None
-        # Return results
-        return (
-            "\n\n=== Page Break ===\n\n".join([r['text'] for r in results]),
-            "\n".join([f"Page {r['page']}: {r['validation']} ({r['validation_confidence']:.2%})" for r in results]),
-            "\n".join([f"Page {r['page']}: {r['doc_type']} ({r['doc_confidence']:.2%})" for r in results]),
-            excel_file
-        )
     except Exception as e:
-        print(f"Error in process_claim: {str(e)}")
-        return f"Processing error: {str(e)}", "Error", "Error", None
 # Create the Gradio interface with improved UI and error handling
 iface = gr.Interface(
-    fn=process_claim,
     inputs=[
         gr.File(
             label="Upload Insurance Document",

 import gradio as gr
 import pdf2image
 import PyPDF2
 import io
+import pandas as pd
+import logging
 import cv2
+from datetime import datetime
+import time
+# Set up logging for error handling
+logging.basicConfig(level=logging.DEBUG)
 # Initialize the OCR reader
 reader = easyocr.Reader(['en'])
+# Use text classification model (distilbert for sentiment analysis or text validation)
 text_classifier = pipeline("text-classification", model="distilbert-base-uncased-finetuned-sst-2-english")
+# Use document classification model (ResNet50 as an example)
 doc_classifier = pipeline("image-classification", model="microsoft/resnet-50")
 def convert_pdf_to_images(pdf_file):
+    """Convert PDF to list of images with detailed logging"""
     try:
+        logging.debug("Starting PDF to image conversion...")
+        start_time = time.time()
         # Save PDF content to a temporary file
         pdf_content = pdf_file.read()
         pdf_buffer = io.BytesIO(pdf_content)
         pdf_buffer.seek(0)
         # Convert to images
+        images = pdf2image.convert_from_bytes(pdf_buffer.read(), dpi=300, fmt='PNG')
         if not images:
             raise ValueError("No images extracted from PDF")
+        # Log how long the conversion took
+        logging.debug(f"PDF to images conversion completed in {time.time() - start_time} seconds.")
         return images
     except Exception as e:
+        logging.error(f"PDF conversion error: {str(e)}")
         return None
 def process_single_image(image):
+    """Process a single image with detailed logging"""
     try:
+        logging.debug("Starting image processing...")
+        start_time = time.time()
+        # Ensure 'img' is properly assigned, handling both PDFs and images
+        if isinstance(image, Image.Image):  # if it's a PIL Image
+            img = image
+        else:
+            raise ValueError("Uploaded file is not a valid image.")
+        # OCR processing and classification
+        extracted_text = reader.readtext(np.array(img))
+        extracted_text = ' '.join([text[1] for text in extracted_text])
+        validation_result = text_classifier(extracted_text[:512])
+        sentiment = validation_result[0]['label']
+        confidence = validation_result[0]['score']
+        doc_result = doc_classifier(img)
+        doc_type = doc_result[0]['label']
+        doc_confidence = doc_result[0]['score']
+        # Log how long processing took
+        logging.debug(f"Image processing completed in {time.time() - start_time} seconds.")
+        return {
+            'text': extracted_text,
+            'validation': sentiment,
+            'validation_confidence': confidence,
+            'doc_type': doc_type,
+            'doc_confidence': doc_confidence
+        }
     except Exception as e:
+        logging.error(f"Error processing the image: {str(e)}")
+        return {'error': f"Error processing the image: {str(e)}"}
+# Gradio interface setup
+def gradio_interface(input_file):
+    """Handle both PDF and image files uploaded by the user"""
     try:
+        if input_file.name.lower().endswith('.pdf'):
+            images = convert_pdf_to_images(input_file)
+            if images is None:
+                return {'error': 'Invalid PDF or unable to extract images'}
+            result = process_single_image(images[0])
+        elif input_file.name.lower().endswith(('png', 'jpg', 'jpeg')):
+            img = Image.open(input_file)
+            result = process_single_image(img)
+        else:
+            return {'error': 'Unsupported file type. Please upload a valid image or PDF.'}
+        return result
     except Exception as e:
+        logging.error(f"Error in file processing: {str(e)}")
+        return {'error': f"Error processing the file: {str(e)}"}
 # Create the Gradio interface with improved UI and error handling
 iface = gr.Interface(
+    fn=gradio_interface,
     inputs=[
         gr.File(
             label="Upload Insurance Document",