Spaces:

blessedpug
/

Demo_Build

Sleeping

App Files Community

blessedpug committed on May 29, 2025

Commit

a1a13bb

1 Parent(s): 8b27fa0

Implemented FastAPI endpoints - Implemented batch processing for pdf forms

Browse files

Files changed (4) hide show

app.py +11 -10
data.json +88 -0
main.py +216 -0
pipeline.py +103 -57

app.py CHANGED Viewed

@@ -1,5 +1,5 @@
 import gradio as gr
-from pipeline import extract_info_batch, extract_child_fee_info, extract_medical_info, extract_medical_info_batch
 from PIL import Image
@@ -20,6 +20,8 @@ with gr.Blocks() as demo:
                     )
                 with gr.Column(scale=2):
                     batch_output_box = gr.Markdown(
                         value="Upload Images to extract information",
                         label="Batch Extracted Info",
@@ -37,13 +39,12 @@ with gr.Blocks() as demo:
         with gr.Tab("Reimbursement Forms"):
             with gr.Row():
                 with gr.Column(scale=2):
-                    img_input = gr.Image(
-                        type="pil",
-                        label="Image Upload",
-                        elem_id="upload-img",
-                        show_label=False,
-                        height=512,
-                        width=512
                     )
                 with gr.Column(scale=2):
@@ -65,8 +66,8 @@ with gr.Blocks() as demo:
                     upload_btn.click(
-                        fn=extract_child_fee_info,
-                        inputs=[img_input, emp_name, emp_code, department,form_name],
                         outputs=preview_output
                     )

 import gradio as gr
+from pipeline import extract_info_batch, extract_reimbursement_form_info, extract_medical_info, extract_medical_info_batch
 from PIL import Image
                     )
                 with gr.Column(scale=2):
+                    gr.Markdown("## Receipt Reimbursement Portal")
                     batch_output_box = gr.Markdown(
                         value="Upload Images to extract information",
                         label="Batch Extracted Info",
         with gr.Tab("Reimbursement Forms"):
             with gr.Row():
                 with gr.Column(scale=2):
+                    reimbursement_img_input = gr.File(
+                        file_types=["image"],
+                        label="Batch Image Upload",
+                        elem_id="batch-upload-img",
+                        show_label=True,
+                        file_count="multiple"
                     )
                 with gr.Column(scale=2):
                     upload_btn.click(
+                        fn=extract_reimbursement_form_info,
+                        inputs=[reimbursement_img_input, emp_name, emp_code, department,form_name],
                         outputs=preview_output
                     )

data.json CHANGED Viewed

@@ -90,5 +90,93 @@
                 "amount": 9.48
             }
         ]
     }
 ]

                 "amount": 9.48
             }
         ]
+    },
+    {
+        "fraud_check": [],
+        "merchant": "CSH Pharmacy",
+        "date": "17/01/2025",
+        "total_amount": 500.0,
+        "items": [
+            {
+                "description": "Arinac Tab (w)",
+                "amount": 8.6
+            },
+            {
+                "description": "Tarivid 200mg Tab (w)",
+                "amount": 301.8
+            },
+            {
+                "description": "Soflin 10mg Tab 30,s",
+                "amount": 18.05
+            }
+        ]
+    },
+    {
+        "fraud_check": [],
+        "merchant": "CSH Pharmacy",
+        "date": "17/01/2025",
+        "total_amount": 1449.0,
+        "items": [
+            {
+                "description": "Bofalgan 1g/100ml Inj (w)",
+                "amount": 225.0
+            },
+            {
+                "description": "Oxidil Inj 1gm Iv N/p",
+                "amount": 450.0
+            },
+            {
+                "description": "Drip Sot Max Care",
+                "amount": 200.0
+            },
+            {
+                "description": "10cc Syringe (smd)",
+                "amount": 50.0
+            },
+            {
+                "description": "9% 100ml Medisol",
+                "amount": 93.23
+            },
+            {
+                "description": "Iv Branula 24g (b Braun) W B",
+                "amount": 430.0
+            }
+        ]
+    },
+    {
+        "fraud_check": [],
+        "merchant": "CHUGHTAI PHARMACY",
+        "date": "15/01/2025",
+        "total_amount": 1394.0,
+        "items": [
+            {
+                "description": "N/s Plasamine inf 0.9% 100ml",
+                "amount": 102.0
+            },
+            {
+                "description": "Drip Set (classic)",
+                "amount": 150.0
+            },
+            {
+                "description": "B. Braun Branula Introcan 24 G",
+                "amount": 430.0
+            },
+            {
+                "description": "Apple Syringe 10cc 100s",
+                "amount": 390.0
+            },
+            {
+                "description": "Oxidil inj 1 W 1 Gm 1 Vial",
+                "amount": 352.0
+            },
+            {
+                "description": "Neurobion inj 25 Ampx3",
+                "amount": 120.0
+            },
+            {
+                "description": "Bofalgan inj 1 Ampx100 ml",
+                "amount": 207.0
+            }
+        ]
     }
 ]

main.py ADDED Viewed

@@ -0,0 +1,216 @@

+from fastapi import FastAPI, File, UploadFile, Form, HTTPException
+from fastapi.responses import JSONResponse, FileResponse
+from typing import List, Optional
+from PIL import Image
+import tempfile
+import os
+import shutil
+import json # Added json import
+# Corrected and consolidated imports from pipeline
+from pipeline import (
+    extract_info,
+    # extract_info_batch, # This function in pipeline.py takes file paths, FastAPI will call extract_info individually
+    extract_reimbursement_form_info,
+    extract_medical_info,
+    extract_medical_info_batch
+)
+# Assuming models.py contains necessary Pydantic models, though not directly used in this file for request validation beyond FastAPI's
+# from models import ReceiptData, ChildFeeForm
+app = FastAPI()
+# Ensure output directory exists
+os.makedirs("outputs", exist_ok=True)
+@app.get("/")
+async def read_root():
+    return {"message": "Welcome to the Document Processing API"}
+@app.post("/extract_receipt_info_batch/")
+async def extract_receipt_batch_endpoint(files: List[UploadFile] = File(...)):
+    results = []
+    if not files:
+        raise HTTPException(status_code=400, detail="No files uploaded.")
+    for file_upload in files: # Renamed to avoid conflict
+        try:
+            if not file_upload.content_type.startswith("image/"):
+                results.append({"filename": file_upload.filename, "error": "File is not an image."})
+                continue
+            pil_image = Image.open(file_upload.file)
+            result_json_str = extract_info(pil_image)
+            if result_json_str.startswith("```json"):
+                actual_json_content = result_json_str[7:-4].strip()
+                results.append({"filename": file_upload.filename, "data": json.loads(actual_json_content)})
+            else:
+                 results.append({"filename": file_upload.filename, "data": json.loads(result_json_str)})
+        except Exception as e:
+            results.append({"filename": file_upload.filename, "error": str(e)})
+        finally:
+            file_upload.file.close() # Ensure file is closed
+    return JSONResponse(content=results)
+@app.post("/extract_reimbursement_form_batch/")
+async def extract_reimbursement_form_batch_endpoint(
+    files: List[UploadFile] = File(...),
+    emp_name: str = Form(...),
+    emp_code: str = Form(...),
+    department: str = Form(...),
+    form_name: str = Form(...)
+):
+    pil_images = []
+    if not files:
+        raise HTTPException(status_code=400, detail="No files uploaded for child fee processing.")
+    for file_upload in files:
+        try:
+            if not file_upload.content_type.startswith("image/"):
+                # Consider how to handle mix of valid/invalid files; for now, error out
+                raise HTTPException(status_code=400, detail=f"File '{file_upload.filename}' is not an image.")
+            pil_images.append(Image.open(file_upload.file))
+        except Exception as e: # Catch error during Image.open or content_type check
+            # Clean up already opened files if any before raising
+            for uploaded_file_obj in files: # Close all originally uploaded file objects
+                 if hasattr(uploaded_file_obj, 'file') and not uploaded_file_obj.file.closed:
+                    uploaded_file_obj.file.close()
+            raise HTTPException(status_code=400, detail=f"Error processing file '{file_upload.filename}': {str(e)}")
+        # We don't close file_upload.file here, Image.open() might keep it open or it might be closed by PIL.
+        # The finally block will handle closing all files.
+    if not pil_images: # Should be caught by `if not files` or the loop erroring, but as a safeguard.
+        raise HTTPException(status_code=400, detail="No valid images could be processed.")
+    try:
+        pdf_path = extract_reimbursement_form_info(
+            img_inputs=pil_images,
+            emp_name=emp_name,
+            emp_code=emp_code,
+            department=department,
+            form_name=form_name
+        )
+        if pdf_path and os.path.exists(pdf_path):
+            return FileResponse(pdf_path, media_type='application/pdf', filename=os.path.basename(pdf_path))
+        else:
+            # This implies extract_reimbursement_form_info returned None (e.g. no items extracted, or PDF gen error)
+            raise HTTPException(status_code=500, detail="Failed to generate PDF. No items might have been extracted or an internal error occurred.")
+    except HTTPException as he:
+        raise he
+    except Exception as e:
+        return JSONResponse(status_code=500, content={"error": "Failed to process child fee form batch", "detail": str(e)})
+    finally:
+        # Ensure all uploaded files are closed
+        for file_upload in files:
+            if hasattr(file_upload, 'file') and not file_upload.file.closed:
+                file_upload.file.close()
+@app.post("/extract_medical_info_batch/")
+async def extract_medical_batch_endpoint(
+    files: List[UploadFile] = File(...),
+    emp_name: str = Form(...),
+    emp_code: str = Form(...),
+    department: str = Form(...),
+    designation: str = Form(...),
+    company: str = Form(...),
+    extension_no: str = Form(...)
+):
+    if not files:
+        raise HTTPException(status_code=400, detail="No files uploaded.")
+    temp_files_info = []
+    temp_dir = tempfile.mkdtemp()
+    try:
+        for file_upload in files:
+            if not file_upload.content_type.startswith("image/"):
+                # Clean up for this specific error case
+                for temp_info_obj in temp_files_info: # Iterate over created MockFileObject
+                    if os.path.exists(temp_info_obj.name):
+                         os.remove(temp_info_obj.name)
+                if os.path.exists(temp_dir):
+                    shutil.rmtree(temp_dir)
+                raise HTTPException(status_code=400, detail=f"File '{file_upload.filename}' is not an image.")
+            temp_file_path = ""
+            try:
+                # Ensure filename is somewhat safe for path joining, though mkdtemp helps isolate
+                safe_filename = os.path.basename(file_upload.filename) if file_upload.filename else "unknown_file"
+                temp_file_path = os.path.join(temp_dir, safe_filename)
+                with open(temp_file_path, "wb") as f_temp:
+                    shutil.copyfileobj(file_upload.file, f_temp)
+                class MockFileObject: # Defined inside or ensure it's available
+                    def __init__(self, path, original_filename):
+                        self.name = path
+                        self.original_filename = original_filename
+                temp_files_info.append(MockFileObject(temp_file_path, file_upload.filename))
+            finally:
+                file_upload.file.close()
+        if not temp_files_info:
+             if os.path.exists(temp_dir): # Cleanup if no valid files were processed
+                shutil.rmtree(temp_dir)
+             raise HTTPException(status_code=400, detail="No valid image files to process after filtering.")
+        html_path = extract_medical_info_batch(
+            image_file_list=temp_files_info, # Pass list of MockFileObjects
+            emp_name=emp_name,
+            emp_code=emp_code,
+            department=department,
+            designation=designation,
+            company=company,
+            extension_no=extension_no
+        )
+        if html_path and os.path.exists(html_path):
+            status_code_to_return = 200
+            if "error_no_medical_form_images" in os.path.basename(html_path):
+                status_code_to_return = 400
+            return FileResponse(html_path, media_type='text/html', filename=os.path.basename(html_path), status_code=status_code_to_return)
+        else:
+            raise HTTPException(status_code=500, detail="Failed to generate consolidated HTML medical form. The batch function may have returned an invalid path or None, or the file doesn't exist.")
+    except HTTPException as he:
+        # General cleanup for HTTPExceptions raised within the main try
+        for temp_info_obj in temp_files_info:
+            if os.path.exists(temp_info_obj.name):
+                 os.remove(temp_info_obj.name)
+        if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+        raise he
+    except Exception as e:
+        # General cleanup for other exceptions
+        for temp_info_obj in temp_files_info:
+            if os.path.exists(temp_info_obj.name):
+                 os.remove(temp_info_obj.name)
+        if os.path.exists(temp_dir):
+            shutil.rmtree(temp_dir)
+        return JSONResponse(status_code=500, content={"error": "Failed to process batch medical forms", "detail": str(e)})
+    finally:
+        # This finally block attempts cleanup again, belt-and-suspenders.
+        # It's particularly for the temp_dir itself if not cleaned by specific error handlers.
+        # Individual files in temp_files_info should ideally be cleaned by the except blocks.
+        if 'temp_dir' in locals() and os.path.exists(temp_dir):
+            # Aggressively try to clean contents if not already done
+            for item_name in os.listdir(temp_dir):
+                item_path = os.path.join(temp_dir, item_name)
+                try:
+                    if os.path.isfile(item_path) or os.path.islink(item_path):
+                        os.unlink(item_path)
+                    elif os.path.isdir(item_path): # Should not happen if temp_files are files
+                        shutil.rmtree(item_path)
+                except Exception as e_clean_item:
+                    print(f"Error cleaning up item {item_path} in temp_dir: {e_clean_item}")
+            try:
+                shutil.rmtree(temp_dir) # Remove the directory itself
+            except Exception as e_clean_dir:
+                print(f"Error final cleanup of temp directory {temp_dir}: {e_clean_dir}")
+# Ensure no trailing comments like "# We will add more endpoints below"

pipeline.py CHANGED Viewed

@@ -10,6 +10,7 @@ from form_fill import fill_child_fee_pdf, fill_medical_pdf
 from fraud import process_receipt
 from datetime import datetime
 import html
 load_dotenv()
@@ -24,7 +25,7 @@ reciept_system_prompt = (
     "    description: str\n"
     "    amount: float\n\n"
     "class FraudData(BaseModel):\n"
-    "    fraud_detected: bool \n"
     "    fraud_type: Optional[str] = None  # Type of fraud if detected, e.g., \"duplicate\", \"suspicious\" \n\n"
     "class ReceiptData(BaseModel):\n"
     "    fraud_check: Optional[List[FraudData]] = []  # Optional field for fraud detection, always set to empty list\n"
@@ -145,83 +146,128 @@ def extract_info(pil_img):
         return f"```json\n{json.dumps({'error': str(e), 'raw_output': raw_output}, indent=2)}\n```"
-def extract_child_fee_info(img_input, emp_name, emp_code, department, form_name):
-    print(emp_name, emp_code, department)
-    processed_image = preprocess_image(img_input)
-    img_bytes = pil_to_bytes(processed_image)
-    img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
-    response = openai.chat.completions.create(
-        model="gpt-4o",
-        messages=[
-            {"role": "system", "content": fee_bill_system_prompt},
-            {"role": "user",
-             "content": [
-                {"type": "text", "text": "Here is a child fee bill image:"},
-                {"type": "image_url", "image_url": {"url": "data:image/png;base64," + img_base64}}
-             ]}
-        ]
-    )
-    raw_output = response.choices[0].message.content
-    try:
-        if raw_output.startswith("```"):
-            raw_output = raw_output.strip("` \n")
-            if raw_output.startswith("json"):
-                raw_output = raw_output[4:].strip()
-        data = json.loads(raw_output)
-        print(data)
-        # Validate if needed:
-        # ChildFeeForm(**data)
-        # Extract bill_month from first item if available, else use empty string
-        items = data.get("items", [])
-        bill_month = ""
-        if items and "bill_month" in items[0]:
-            bill_month = items[0]["bill_month"]
-        os.makedirs("outputs", exist_ok=True)
-        if form_name == "Child Fee Reimbursement Form":
-            output_pdf_path = f"outputs/filled_child_fee_reimbursement_form_{uuid.uuid4().hex}.pdf"
-        elif form_name == "Internet Charges Form":
-            output_pdf_path = f"outputs/filled_internet_charges_reimbursement_form_{uuid.uuid4().hex}.pdf"
-        elif form_name == "Mobile Reimbursement Form":
-            output_pdf_path = f"outputs/filled_mobile_reimbursement_form_{uuid.uuid4().hex}.pdf"
         filled_pdf_path = fill_child_fee_pdf(
             template_pdf_path="templates/REIMBURSEMENT FORM.pdf",
             output_pdf_path=output_pdf_path,
             emp_name=emp_name,
             emp_code=emp_code,
             department=department,
-            bill_month=bill_month,
-            items=items,
-            total=data.get("total", "")
         )
-        return filled_pdf_path # Return path to Gradio for download
     except Exception as e:
-        print("ERROR:", e)
-        return None  # or f"Error: {str(e)}"
-def extract_info_batch(file_list):
-    """
-    Accepts a list of file objects/paths, processes each as a PIL image, and returns results.
-    """
-    results = []
-    for file in file_list:
-        img = Image.open(file)
-        results.append(extract_info(img))
-    return "\n\n".join(results)

 from fraud import process_receipt
 from datetime import datetime
 import html
+from typing import List
 load_dotenv()
     "    description: str\n"
     "    amount: float\n\n"
     "class FraudData(BaseModel):\n"
+    "    fraud_detected: bool # either True or False\n"
     "    fraud_type: Optional[str] = None  # Type of fraud if detected, e.g., \"duplicate\", \"suspicious\" \n\n"
     "class ReceiptData(BaseModel):\n"
     "    fraud_check: Optional[List[FraudData]] = []  # Optional field for fraud detection, always set to empty list\n"
         return f"```json\n{json.dumps({'error': str(e), 'raw_output': raw_output}, indent=2)}\n```"
+def extract_info_batch(file_list):
+    """
+    Accepts a list of file objects/paths, processes each as a PIL image, and returns results.
+    """
+    results = []
+    for file in file_list:
+        img = Image.open(file)
+        results.append(extract_info(img))
+    return "\n\n".join(results)
+def extract_reimbursement_form_info(img_inputs: List[Image.Image], emp_name: str, emp_code: str, department: str, form_name: str):
+    print(f"Processing child fee info for: {emp_name}, {emp_code}, {department}, Form: {form_name}")
+    consolidated_items = []
+    consolidated_total = 0.0
+    first_bill_month_found = ""
+    processed_image_count = 0
+    for i, img_input_item in enumerate(img_inputs):
+        print(f"Processing image {i+1} of {len(img_inputs)} for child fee form...")
+        try:
+            current_pil_img = None
+            if isinstance(img_input_item, Image.Image):
+                current_pil_img = img_input_item
+            else:
+                # Assume img_input_item is a path, filename, or a file-like object
+                # that Image.open() can handle (like Gradio's NamedString if it behaves like a path or has a read method)
+                current_pil_img = Image.open(img_input_item)
+            processed_image = preprocess_image(current_pil_img)
+            img_bytes = pil_to_bytes(processed_image)
+            img_base64 = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
+            response = openai.chat.completions.create(
+                model="gpt-4o",
+                messages=[
+                    {"role": "system", "content": fee_bill_system_prompt},
+                    {"role": "user",
+                     "content": [
+                        {"type": "text", "text": f"Here is a child fee bill image (part {i+1} of a batch):"},
+                        {"type": "image_url", "image_url": {"url": "data:image/png;base64," + img_base64}}
+                     ]}
+                ]
+            )
+            raw_output = response.choices[0].message.content
+            print(f"Raw output from LLM for image {i+1}: {raw_output}")
+            if raw_output.startswith("```"):
+                raw_output = raw_output.strip("` \n")
+                if raw_output.startswith("json"):
+                    raw_output = raw_output[4:].strip()
+            data = json.loads(raw_output)
+            print(f"Parsed data from LLM for image {i+1}: {data}")
+            current_items = data.get("items", [])
+            if current_items:
+                consolidated_items.extend(current_items)
+                # Summing up totals from each bill, or summing items directly for more accuracy
+                for item in current_items:
+                    consolidated_total += float(item.get("amount", 0) or 0)
+            if not first_bill_month_found and current_items and "bill_month" in current_items[0]:
+                first_bill_month_found = current_items[0]["bill_month"]
+            processed_image_count +=1
+        except Exception as e:
+            print(f"ERROR processing image {i+1} for child fee form: {e}")
+            # Decide if one error should stop the whole batch or just skip the problematic image
+            # For now, we skip and continue
+            continue
+    if not consolidated_items: # No items extracted from any image
+        print("No items were extracted from any of the provided images for child fee form.")
+        # Potentially return an error or an empty PDF/status message
+        # For now, let's create an empty PDF as the function expects to return a path
+        # Or, it might be better to return None and let the API endpoint handle the error response.
+        return None
+    print(f"Consolidated {len(consolidated_items)} items from {processed_image_count} images.")
+    print(f"Final total: {consolidated_total}, Bill month to use: {first_bill_month_found}")
+    os.makedirs("outputs", exist_ok=True)
+    # Adjust filename to indicate consolidation if multiple images were processed
+    file_suffix = f"{uuid.uuid4().hex}"
+    if len(img_inputs) > 1:
+        file_suffix = f"batch_{file_suffix}"
+    if form_name == "Child Fee Reimbursement Form":
+        output_pdf_path = f"outputs/filled_child_fee_reimbursement_form_{file_suffix}.pdf"
+    elif form_name == "Internet Charges Form":
+        output_pdf_path = f"outputs/filled_internet_charges_reimbursement_form_{file_suffix}.pdf"
+    elif form_name == "Mobile Reimbursement Form":
+        output_pdf_path = f"outputs/filled_mobile_reimbursement_form_{file_suffix}.pdf"
+    else: # Default or error case
+        output_pdf_path = f"outputs/filled_unknown_reimbursement_form_{file_suffix}.pdf"
+    try:
         filled_pdf_path = fill_child_fee_pdf(
             template_pdf_path="templates/REIMBURSEMENT FORM.pdf",
             output_pdf_path=output_pdf_path,
             emp_name=emp_name,
             emp_code=emp_code,
             department=department,
+            bill_month=first_bill_month_found,
+            items=consolidated_items, # Use consolidated items
+            total=consolidated_total # Use consolidated total
         )
+        return filled_pdf_path
     except Exception as e:
+        print(f"ERROR during PDF generation for consolidated child fee form: {e}")
+        return None