anujakkulkarni committed on
Commit
2aa2caf
·
verified ·
1 Parent(s): 0da7d43

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -13
app.py CHANGED
@@ -15,6 +15,8 @@ from fastapi.middleware.cors import CORSMiddleware
15
  from fastapi.responses import JSONResponse
16
  from starlette.requests import Request
17
  import fitz # PyMuPDF
 
 
18
 
19
  # Azure Blob Storage
20
  try:
@@ -31,8 +33,7 @@ except ImportError:
31
 
32
  # Google Gemini - optional import
33
  try:
34
- import google.generativeai as genai
35
- from PIL import Image
36
  GEMINI_AVAILABLE = True
37
  except ImportError:
38
  GEMINI_AVAILABLE = False
@@ -875,7 +876,7 @@ async def split_invoices(
875
  max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
876
  ):
877
  """
878
- ⭐ OPTIMIZED INVOICE SPLITTER WITH AZURE BLOB STORAGE
879
 
880
  Performance Improvements:
881
  - Parallel Gemini API calls (5-10x faster for image PDFs)
@@ -883,25 +884,57 @@ async def split_invoices(
883
  - Reduced image resolution for faster processing
884
  - Optimized prompts for quicker responses
885
 
 
 
 
 
886
  Folder Structure in Blob Storage:
887
  POD/
888
  └── {batch_id}/
889
  └── {filename}/
890
- ├── Raw/ (original uploaded PDF)
891
  └── Splitted/ (individual split invoice PDFs)
892
 
893
  Required Parameters:
894
- - file: PDF file to upload
895
  - batch_id: Batch identifier (used for folder structure)
896
 
897
  Returns:
898
  - All invoice URLs with proper folder paths
899
  """
900
 
901
- # Validation
902
- if not file.filename.lower().endswith(".pdf"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
903
  raise HTTPException(
904
- status_code=400, detail="Only PDF files are supported")
 
 
 
 
 
 
 
 
 
 
 
905
 
906
  # Check blob storage
907
  if use_blob_storage and not get_blob_service_client():
@@ -917,16 +950,19 @@ async def split_invoices(
917
 
918
  # Stream upload to temp file
919
  max_size_bytes = max_file_size_mb * 1024 * 1024
920
- fd, temp_path = tempfile.mkstemp(suffix=".pdf")
921
  os.close(fd)
922
 
923
  doc = None
924
  original_pdf_bytes = None
925
  start_time = datetime.now()
 
 
926
 
927
  try:
928
  print(f"\n{'='*70}")
929
  print(f"πŸ“₯ Processing: {file.filename}")
 
930
  print(f" Batch ID: {batch_id}")
931
  print(
932
  f" Performance Mode: {'Smart Sampling' if use_smart_sampling else f'Parallel ({parallel_batch_size} workers)'}")
@@ -946,15 +982,50 @@ async def split_invoices(
946
  file_size_mb = total_size / (1024 * 1024)
947
  print(f"πŸ’Ύ File size: {file_size_mb:.2f}MB")
948
 
949
- # Read original PDF bytes
950
- with open(temp_path, "rb") as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
951
  original_pdf_bytes = f.read()
952
 
953
  # Upload original PDF to Raw folder
954
  raw_pdf_info = None
955
  if use_blob_storage:
956
  try:
957
- print(f"\nπŸ“€ Uploading original PDF to Raw folder...")
958
  raw_pdf_info = upload_raw_pdf_to_blob(
959
  original_pdf_bytes,
960
  file.filename,
@@ -966,7 +1037,7 @@ async def split_invoices(
966
  print(f"⚠️ Failed to upload raw PDF: {e}")
967
 
968
  # Open PDF for processing
969
- doc = fitz.open(temp_path)
970
  if doc.page_count == 0:
971
  raise HTTPException(status_code=400, detail="Empty PDF")
972
 
@@ -1159,7 +1230,12 @@ async def split_invoices(
1159
  # Close document
1160
  doc.close()
1161
  doc = None
 
 
1162
  remove_file(temp_path)
 
 
 
1163
  gc.collect()
1164
 
1165
  # Calculate total processing time
@@ -1177,9 +1253,11 @@ async def split_invoices(
1177
  },
1178
  "source_file": {
1179
  "name": file.filename,
 
1180
  "size_mb": round(file_size_mb, 2),
1181
  "total_pages": total_pages_count,
1182
  "pdf_type": "image-based" if is_image_pdf else "text-based",
 
1183
  "raw_pdf": raw_pdf_info
1184
  },
1185
  "summary": {
@@ -1202,6 +1280,9 @@ async def split_invoices(
1202
  print(f"\n{'='*70}")
1203
  print(f"βœ… SUCCESS!")
1204
  print(f" Batch ID: {batch_id}")
 
 
 
1205
  print(
1206
  f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
1207
  print(f" Split invoices: {len(all_parts)}")
@@ -1224,6 +1305,8 @@ async def split_invoices(
1224
  if doc:
1225
  doc.close()
1226
  remove_file(temp_path)
 
 
1227
  gc.collect()
1228
 
1229
 
 
15
  from fastapi.responses import JSONResponse
16
  from starlette.requests import Request
17
  import fitz # PyMuPDF
18
+ import google.generativeai as genai
19
+ from PIL import Image
20
 
21
  # Azure Blob Storage
22
  try:
 
33
 
34
  # Google Gemini - optional import
35
  try:
36
+
 
37
  GEMINI_AVAILABLE = True
38
  except ImportError:
39
  GEMINI_AVAILABLE = False
 
876
  max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
877
  ):
878
  """
879
+ ⭐ OPTIMIZED INVOICE SPLITTER - SUPPORTS PDF AND IMAGES
880
 
881
  Performance Improvements:
882
  - Parallel Gemini API calls (5-10x faster for image PDFs)
 
884
  - Reduced image resolution for faster processing
885
  - Optimized prompts for quicker responses
886
 
887
+ File Support:
888
+ - PDF files (text-based or image-based)
889
+ - Image files (PNG, JPG, JPEG, TIFF, BMP) - auto-converted to PDF
890
+
891
  Folder Structure in Blob Storage:
892
  POD/
893
  └── {batch_id}/
894
  └── {filename}/
895
+ ├── Raw/ (original uploaded file)
896
  └── Splitted/ (individual split invoice PDFs)
897
 
898
  Required Parameters:
899
+ - file: PDF or image file to upload
900
  - batch_id: Batch identifier (used for folder structure)
901
 
902
  Returns:
903
  - All invoice URLs with proper folder paths
904
  """
905
 
906
+ # ============================================================================
907
+ # ENHANCED VALIDATION - ACCEPT PDF AND IMAGES
908
+ # ============================================================================
909
+
910
+ if not file.filename:
911
+ raise HTTPException(status_code=400, detail="No filename provided")
912
+
913
+ filename_lower = file.filename.lower()
914
+
915
+ # Supported formats
916
+ SUPPORTED_EXTENSIONS = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
917
+
918
+ file_extension = None
919
+ for ext in SUPPORTED_EXTENSIONS:
920
+ if filename_lower.endswith(ext):
921
+ file_extension = ext
922
+ break
923
+
924
+ if not file_extension:
925
  raise HTTPException(
926
+ status_code=400,
927
+ detail=f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, TIFF, BMP"
928
+ )
929
+
930
+ is_image_file = file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
931
+
932
+ # Check PIL availability for image files
933
+ if is_image_file and not GEMINI_AVAILABLE:
934
+ raise HTTPException(
935
+ status_code=500,
936
+ detail="Image processing requires PIL. Install: pip install Pillow"
937
+ )
938
 
939
  # Check blob storage
940
  if use_blob_storage and not get_blob_service_client():
 
950
 
951
  # Stream upload to temp file
952
  max_size_bytes = max_file_size_mb * 1024 * 1024
953
+ fd, temp_path = tempfile.mkstemp(suffix=file_extension)
954
  os.close(fd)
955
 
956
  doc = None
957
  original_pdf_bytes = None
958
  start_time = datetime.now()
959
+ pdf_path = temp_path
960
+ original_filename = file.filename
961
 
962
  try:
963
  print(f"\n{'='*70}")
964
  print(f"πŸ“₯ Processing: {file.filename}")
965
+ print(f" File Type: {'IMAGE' if is_image_file else 'PDF'}")
966
  print(f" Batch ID: {batch_id}")
967
  print(
968
  f" Performance Mode: {'Smart Sampling' if use_smart_sampling else f'Parallel ({parallel_batch_size} workers)'}")
 
982
  file_size_mb = total_size / (1024 * 1024)
983
  print(f"πŸ’Ύ File size: {file_size_mb:.2f}MB")
984
 
985
+ # ============================================================================
986
+ # IMAGE TO PDF CONVERSION
987
+ # ============================================================================
988
+
989
+ if is_image_file:
990
+ print(f"πŸ–ΌοΈ Converting image to PDF...")
991
+ try:
992
+ from PIL import Image as PILImage
993
+
994
+ # Open image and convert to PDF
995
+ img = PILImage.open(temp_path)
996
+
997
+ # Convert to RGB if necessary (for RGBA, grayscale, etc.)
998
+ if img.mode != 'RGB':
999
+ img = img.convert('RGB')
1000
+
1001
+ # Create PDF path
1002
+ pdf_path = temp_path.replace(file_extension, '.pdf')
1003
+
1004
+ # Save as PDF
1005
+ img.save(pdf_path, 'PDF', resolution=100.0)
1006
+ img.close()
1007
+
1008
+ print(f"βœ… Image converted to PDF")
1009
+
1010
+ # Update filename for storage
1011
+ file.filename = file.filename.replace(file_extension, '.pdf')
1012
+
1013
+ except Exception as e:
1014
+ print(f"❌ Image conversion failed: {e}")
1015
+ raise HTTPException(
1016
+ status_code=500,
1017
+ detail=f"Failed to convert image to PDF: {str(e)}"
1018
+ )
1019
+
1020
+ # Read PDF bytes (either original or converted)
1021
+ with open(pdf_path, "rb") as f:
1022
  original_pdf_bytes = f.read()
1023
 
1024
  # Upload original PDF to Raw folder
1025
  raw_pdf_info = None
1026
  if use_blob_storage:
1027
  try:
1028
+ print(f"\nπŸ“€ Uploading original {'PDF' if not is_image_file else 'converted PDF'} to Raw folder...")
1029
  raw_pdf_info = upload_raw_pdf_to_blob(
1030
  original_pdf_bytes,
1031
  file.filename,
 
1037
  print(f"⚠️ Failed to upload raw PDF: {e}")
1038
 
1039
  # Open PDF for processing
1040
+ doc = fitz.open(pdf_path)
1041
  if doc.page_count == 0:
1042
  raise HTTPException(status_code=400, detail="Empty PDF")
1043
 
 
1230
  # Close document
1231
  doc.close()
1232
  doc = None
1233
+
1234
+ # Clean up temp files
1235
  remove_file(temp_path)
1236
+ if pdf_path != temp_path:
1237
+ remove_file(pdf_path)
1238
+
1239
  gc.collect()
1240
 
1241
  # Calculate total processing time
 
1253
  },
1254
  "source_file": {
1255
  "name": file.filename,
1256
+ "original_name": original_filename,
1257
  "size_mb": round(file_size_mb, 2),
1258
  "total_pages": total_pages_count,
1259
  "pdf_type": "image-based" if is_image_pdf else "text-based",
1260
+ "was_converted": is_image_file,
1261
  "raw_pdf": raw_pdf_info
1262
  },
1263
  "summary": {
 
1280
  print(f"\n{'='*70}")
1281
  print(f"βœ… SUCCESS!")
1282
  print(f" Batch ID: {batch_id}")
1283
+ print(f" Original File: {original_filename}")
1284
+ if is_image_file:
1285
+ print(f" βœ“ Image converted to PDF")
1286
  print(
1287
  f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
1288
  print(f" Split invoices: {len(all_parts)}")
 
1305
  if doc:
1306
  doc.close()
1307
  remove_file(temp_path)
1308
+ if pdf_path != temp_path:
1309
+ remove_file(pdf_path)
1310
  gc.collect()
1311
 
1312