Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Files Community

anujakkulkarni committed 9 days ago

Commit

2aa2caf

verified ·

1 Parent(s): 0da7d43

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -13

app.py CHANGED Viewed

@@ -15,6 +15,8 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from starlette.requests import Request
 import fitz  # PyMuPDF
 # Azure Blob Storage
 try:
@@ -31,8 +33,7 @@ except ImportError:
 # Google Gemini - optional import
 try:
-    import google.generativeai as genai
-    from PIL import Image
     GEMINI_AVAILABLE = True
 except ImportError:
     GEMINI_AVAILABLE = False
@@ -875,7 +876,7 @@ async def split_invoices(
     max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
 ):
     """
-    ⭐ OPTIMIZED INVOICE SPLITTER WITH AZURE BLOB STORAGE
     Performance Improvements:
     - Parallel Gemini API calls (5-10x faster for image PDFs)
@@ -883,25 +884,57 @@ async def split_invoices(
     - Reduced image resolution for faster processing
     - Optimized prompts for quicker responses
     Folder Structure in Blob Storage:
     POD/
       └── {batch_id}/
            └── {filename}/
-                ├── Raw/ (original uploaded PDF)
                 └── Splitted/ (individual split invoice PDFs)
     Required Parameters:
-    - file: PDF file to upload
     - batch_id: Batch identifier (used for folder structure)
     Returns:
     - All invoice URLs with proper folder paths
     """
-    # Validation
-    if not file.filename.lower().endswith(".pdf"):
         raise HTTPException(
-            status_code=400, detail="Only PDF files are supported")
     # Check blob storage
     if use_blob_storage and not get_blob_service_client():
@@ -917,16 +950,19 @@ async def split_invoices(
     # Stream upload to temp file
     max_size_bytes = max_file_size_mb * 1024 * 1024
-    fd, temp_path = tempfile.mkstemp(suffix=".pdf")
     os.close(fd)
     doc = None
     original_pdf_bytes = None
     start_time = datetime.now()
     try:
         print(f"\n{'='*70}")
         print(f"📥 Processing: {file.filename}")
         print(f"   Batch ID: {batch_id}")
         print(
             f"   Performance Mode: {'Smart Sampling' if use_smart_sampling else f'Parallel ({parallel_batch_size} workers)'}")
@@ -946,15 +982,50 @@ async def split_invoices(
         file_size_mb = total_size / (1024 * 1024)
         print(f"💾 File size: {file_size_mb:.2f}MB")
-        # Read original PDF bytes
-        with open(temp_path, "rb") as f:
             original_pdf_bytes = f.read()
         # Upload original PDF to Raw folder
         raw_pdf_info = None
         if use_blob_storage:
             try:
-                print(f"\n📤 Uploading original PDF to Raw folder...")
                 raw_pdf_info = upload_raw_pdf_to_blob(
                     original_pdf_bytes,
                     file.filename,
@@ -966,7 +1037,7 @@ async def split_invoices(
                 print(f"⚠️ Failed to upload raw PDF: {e}")
         # Open PDF for processing
-        doc = fitz.open(temp_path)
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="Empty PDF")
@@ -1159,7 +1230,12 @@ async def split_invoices(
         # Close document
         doc.close()
         doc = None
         remove_file(temp_path)
         gc.collect()
         # Calculate total processing time
@@ -1177,9 +1253,11 @@ async def split_invoices(
             },
             "source_file": {
                 "name": file.filename,
                 "size_mb": round(file_size_mb, 2),
                 "total_pages": total_pages_count,
                 "pdf_type": "image-based" if is_image_pdf else "text-based",
                 "raw_pdf": raw_pdf_info
             },
             "summary": {
@@ -1202,6 +1280,9 @@ async def split_invoices(
         print(f"\n{'='*70}")
         print(f"✅ SUCCESS!")
         print(f"   Batch ID: {batch_id}")
         print(
             f"   Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
         print(f"   Split invoices: {len(all_parts)}")
@@ -1224,6 +1305,8 @@ async def split_invoices(
         if doc:
             doc.close()
         remove_file(temp_path)
         gc.collect()

 from fastapi.responses import JSONResponse
 from starlette.requests import Request
 import fitz  # PyMuPDF
+import google.generativeai as genai
+from PIL import Image
 # Azure Blob Storage
 try:
 # Google Gemini - optional import
 try:
     GEMINI_AVAILABLE = True
 except ImportError:
     GEMINI_AVAILABLE = False
     max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
 ):
     """
+    ⭐ OPTIMIZED INVOICE SPLITTER - SUPPORTS PDF AND IMAGES
     Performance Improvements:
     - Parallel Gemini API calls (5-10x faster for image PDFs)
     - Reduced image resolution for faster processing
     - Optimized prompts for quicker responses
+    File Support:
+    - PDF files (text-based or image-based)
+    - Image files (PNG, JPG, JPEG, TIFF, BMP) - auto-converted to PDF
     Folder Structure in Blob Storage:
     POD/
       └── {batch_id}/
            └── {filename}/
+                ├── Raw/ (original uploaded file)
                 └── Splitted/ (individual split invoice PDFs)
     Required Parameters:
+    - file: PDF or image file to upload
     - batch_id: Batch identifier (used for folder structure)
     Returns:
     - All invoice URLs with proper folder paths
     """
+    # ============================================================================
+    # ENHANCED VALIDATION - ACCEPT PDF AND IMAGES
+    # ============================================================================
+    if not file.filename:
+        raise HTTPException(status_code=400, detail="No filename provided")
+    filename_lower = file.filename.lower()
+    # Supported formats
+    SUPPORTED_EXTENSIONS = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
+    file_extension = None
+    for ext in SUPPORTED_EXTENSIONS:
+        if filename_lower.endswith(ext):
+            file_extension = ext
+            break
+    if not file_extension:
         raise HTTPException(
+            status_code=400,
+            detail=f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, TIFF, BMP"
+        )
+    is_image_file = file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
+    # Check PIL availability for image files
+    if is_image_file and not GEMINI_AVAILABLE:
+        raise HTTPException(
+            status_code=500,
+            detail="Image processing requires PIL. Install: pip install Pillow"
+        )
     # Check blob storage
     if use_blob_storage and not get_blob_service_client():
     # Stream upload to temp file
     max_size_bytes = max_file_size_mb * 1024 * 1024
+    fd, temp_path = tempfile.mkstemp(suffix=file_extension)
     os.close(fd)
     doc = None
     original_pdf_bytes = None
     start_time = datetime.now()
+    pdf_path = temp_path
+    original_filename = file.filename
     try:
         print(f"\n{'='*70}")
         print(f"📥 Processing: {file.filename}")
+        print(f"   File Type: {'IMAGE' if is_image_file else 'PDF'}")
         print(f"   Batch ID: {batch_id}")
         print(
             f"   Performance Mode: {'Smart Sampling' if use_smart_sampling else f'Parallel ({parallel_batch_size} workers)'}")
         file_size_mb = total_size / (1024 * 1024)
         print(f"💾 File size: {file_size_mb:.2f}MB")
+        # ============================================================================
+        # IMAGE TO PDF CONVERSION
+        # ============================================================================
+        if is_image_file:
+            print(f"🖼️  Converting image to PDF...")
+            try:
+                from PIL import Image as PILImage
+                # Open image and convert to PDF
+                img = PILImage.open(temp_path)
+                # Convert to RGB if necessary (for RGBA, grayscale, etc.)
+                if img.mode != 'RGB':
+                    img = img.convert('RGB')
+                # Create PDF path
+                pdf_path = temp_path.replace(file_extension, '.pdf')
+                # Save as PDF
+                img.save(pdf_path, 'PDF', resolution=100.0)
+                img.close()
+                print(f"✅ Image converted to PDF")
+                # Update filename for storage
+                file.filename = file.filename.replace(file_extension, '.pdf')
+            except Exception as e:
+                print(f"❌ Image conversion failed: {e}")
+                raise HTTPException(
+                    status_code=500,
+                    detail=f"Failed to convert image to PDF: {str(e)}"
+                )
+        # Read PDF bytes (either original or converted)
+        with open(pdf_path, "rb") as f:
             original_pdf_bytes = f.read()
         # Upload original PDF to Raw folder
         raw_pdf_info = None
         if use_blob_storage:
             try:
+                print(f"\n📤 Uploading original {'PDF' if not is_image_file else 'converted PDF'} to Raw folder...")
                 raw_pdf_info = upload_raw_pdf_to_blob(
                     original_pdf_bytes,
                     file.filename,
                 print(f"⚠️ Failed to upload raw PDF: {e}")
         # Open PDF for processing
+        doc = fitz.open(pdf_path)
         if doc.page_count == 0:
             raise HTTPException(status_code=400, detail="Empty PDF")
         # Close document
         doc.close()
         doc = None
+        # Clean up temp files
         remove_file(temp_path)
+        if pdf_path != temp_path:
+            remove_file(pdf_path)
         gc.collect()
         # Calculate total processing time
             },
             "source_file": {
                 "name": file.filename,
+                "original_name": original_filename,
                 "size_mb": round(file_size_mb, 2),
                 "total_pages": total_pages_count,
                 "pdf_type": "image-based" if is_image_pdf else "text-based",
+                "was_converted": is_image_file,
                 "raw_pdf": raw_pdf_info
             },
             "summary": {
         print(f"\n{'='*70}")
         print(f"✅ SUCCESS!")
         print(f"   Batch ID: {batch_id}")
+        print(f"   Original File: {original_filename}")
+        if is_image_file:
+            print(f"   ✓ Image converted to PDF")
         print(
             f"   Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
         print(f"   Split invoices: {len(all_parts)}")
         if doc:
             doc.close()
         remove_file(temp_path)
+        if pdf_path != temp_path:
+            remove_file(pdf_path)
         gc.collect()