Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -15,6 +15,8 @@ from fastapi.middleware.cors import CORSMiddleware
|
|
| 15 |
from fastapi.responses import JSONResponse
|
| 16 |
from starlette.requests import Request
|
| 17 |
import fitz # PyMuPDF
|
|
|
|
|
|
|
| 18 |
|
| 19 |
# Azure Blob Storage
|
| 20 |
try:
|
|
@@ -31,8 +33,7 @@ except ImportError:
|
|
| 31 |
|
| 32 |
# Google Gemini - optional import
|
| 33 |
try:
|
| 34 |
-
|
| 35 |
-
from PIL import Image
|
| 36 |
GEMINI_AVAILABLE = True
|
| 37 |
except ImportError:
|
| 38 |
GEMINI_AVAILABLE = False
|
|
@@ -875,7 +876,7 @@ async def split_invoices(
|
|
| 875 |
max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
|
| 876 |
):
|
| 877 |
"""
|
| 878 |
-
β OPTIMIZED INVOICE SPLITTER
|
| 879 |
|
| 880 |
Performance Improvements:
|
| 881 |
- Parallel Gemini API calls (5-10x faster for image PDFs)
|
|
@@ -883,25 +884,57 @@ async def split_invoices(
|
|
| 883 |
- Reduced image resolution for faster processing
|
| 884 |
- Optimized prompts for quicker responses
|
| 885 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 886 |
Folder Structure in Blob Storage:
|
| 887 |
POD/
|
| 888 |
└── {batch_id}/
|
| 889 |
└── {filename}/
|
| 890 |
-
├── Raw/ (original uploaded
|
| 891 |
└── Splitted/ (individual split invoice PDFs)
|
| 892 |
|
| 893 |
Required Parameters:
|
| 894 |
-
- file: PDF file to upload
|
| 895 |
- batch_id: Batch identifier (used for folder structure)
|
| 896 |
|
| 897 |
Returns:
|
| 898 |
- All invoice URLs with proper folder paths
|
| 899 |
"""
|
| 900 |
|
| 901 |
-
#
|
| 902 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 903 |
raise HTTPException(
|
| 904 |
-
status_code=400,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 905 |
|
| 906 |
# Check blob storage
|
| 907 |
if use_blob_storage and not get_blob_service_client():
|
|
@@ -917,16 +950,19 @@ async def split_invoices(
|
|
| 917 |
|
| 918 |
# Stream upload to temp file
|
| 919 |
max_size_bytes = max_file_size_mb * 1024 * 1024
|
| 920 |
-
fd, temp_path = tempfile.mkstemp(suffix=
|
| 921 |
os.close(fd)
|
| 922 |
|
| 923 |
doc = None
|
| 924 |
original_pdf_bytes = None
|
| 925 |
start_time = datetime.now()
|
|
|
|
|
|
|
| 926 |
|
| 927 |
try:
|
| 928 |
print(f"\n{'='*70}")
|
| 929 |
print(f"π₯ Processing: {file.filename}")
|
|
|
|
| 930 |
print(f" Batch ID: {batch_id}")
|
| 931 |
print(
|
| 932 |
f" Performance Mode: {'Smart Sampling' if use_smart_sampling else f'Parallel ({parallel_batch_size} workers)'}")
|
|
@@ -946,15 +982,50 @@ async def split_invoices(
|
|
| 946 |
file_size_mb = total_size / (1024 * 1024)
|
| 947 |
print(f"πΎ File size: {file_size_mb:.2f}MB")
|
| 948 |
|
| 949 |
-
#
|
| 950 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 951 |
original_pdf_bytes = f.read()
|
| 952 |
|
| 953 |
# Upload original PDF to Raw folder
|
| 954 |
raw_pdf_info = None
|
| 955 |
if use_blob_storage:
|
| 956 |
try:
|
| 957 |
-
print(f"\nπ€ Uploading original PDF to Raw folder...")
|
| 958 |
raw_pdf_info = upload_raw_pdf_to_blob(
|
| 959 |
original_pdf_bytes,
|
| 960 |
file.filename,
|
|
@@ -966,7 +1037,7 @@ async def split_invoices(
|
|
| 966 |
print(f"β οΈ Failed to upload raw PDF: {e}")
|
| 967 |
|
| 968 |
# Open PDF for processing
|
| 969 |
-
doc = fitz.open(
|
| 970 |
if doc.page_count == 0:
|
| 971 |
raise HTTPException(status_code=400, detail="Empty PDF")
|
| 972 |
|
|
@@ -1159,7 +1230,12 @@ async def split_invoices(
|
|
| 1159 |
# Close document
|
| 1160 |
doc.close()
|
| 1161 |
doc = None
|
|
|
|
|
|
|
| 1162 |
remove_file(temp_path)
|
|
|
|
|
|
|
|
|
|
| 1163 |
gc.collect()
|
| 1164 |
|
| 1165 |
# Calculate total processing time
|
|
@@ -1177,9 +1253,11 @@ async def split_invoices(
|
|
| 1177 |
},
|
| 1178 |
"source_file": {
|
| 1179 |
"name": file.filename,
|
|
|
|
| 1180 |
"size_mb": round(file_size_mb, 2),
|
| 1181 |
"total_pages": total_pages_count,
|
| 1182 |
"pdf_type": "image-based" if is_image_pdf else "text-based",
|
|
|
|
| 1183 |
"raw_pdf": raw_pdf_info
|
| 1184 |
},
|
| 1185 |
"summary": {
|
|
@@ -1202,6 +1280,9 @@ async def split_invoices(
|
|
| 1202 |
print(f"\n{'='*70}")
|
| 1203 |
print(f"β
SUCCESS!")
|
| 1204 |
print(f" Batch ID: {batch_id}")
|
|
|
|
|
|
|
|
|
|
| 1205 |
print(
|
| 1206 |
f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
|
| 1207 |
print(f" Split invoices: {len(all_parts)}")
|
|
@@ -1224,6 +1305,8 @@ async def split_invoices(
|
|
| 1224 |
if doc:
|
| 1225 |
doc.close()
|
| 1226 |
remove_file(temp_path)
|
|
|
|
|
|
|
| 1227 |
gc.collect()
|
| 1228 |
|
| 1229 |
|
|
|
|
| 15 |
from fastapi.responses import JSONResponse
|
| 16 |
from starlette.requests import Request
|
| 17 |
import fitz # PyMuPDF
|
| 18 |
+
import google.generativeai as genai
|
| 19 |
+
from PIL import Image
|
| 20 |
|
| 21 |
# Azure Blob Storage
|
| 22 |
try:
|
|
|
|
| 33 |
|
| 34 |
# Google Gemini - optional import
|
| 35 |
try:
|
| 36 |
+
|
|
|
|
| 37 |
GEMINI_AVAILABLE = True
|
| 38 |
except ImportError:
|
| 39 |
GEMINI_AVAILABLE = False
|
|
|
|
| 876 |
max_file_size_mb: int = Form(200, description="Maximum file size in MB"),
|
| 877 |
):
|
| 878 |
"""
|
| 879 |
+
β OPTIMIZED INVOICE SPLITTER - SUPPORTS PDF AND IMAGES
|
| 880 |
|
| 881 |
Performance Improvements:
|
| 882 |
- Parallel Gemini API calls (5-10x faster for image PDFs)
|
|
|
|
| 884 |
- Reduced image resolution for faster processing
|
| 885 |
- Optimized prompts for quicker responses
|
| 886 |
|
| 887 |
+
File Support:
|
| 888 |
+
- PDF files (text-based or image-based)
|
| 889 |
+
- Image files (PNG, JPG, JPEG, TIFF, BMP) - auto-converted to PDF
|
| 890 |
+
|
| 891 |
Folder Structure in Blob Storage:
|
| 892 |
POD/
|
| 893 |
└── {batch_id}/
|
| 894 |
└── {filename}/
|
| 895 |
+
├── Raw/ (original uploaded file)
|
| 896 |
└── Splitted/ (individual split invoice PDFs)
|
| 897 |
|
| 898 |
Required Parameters:
|
| 899 |
+
- file: PDF or image file to upload
|
| 900 |
- batch_id: Batch identifier (used for folder structure)
|
| 901 |
|
| 902 |
Returns:
|
| 903 |
- All invoice URLs with proper folder paths
|
| 904 |
"""
|
| 905 |
|
| 906 |
+
# ============================================================================
|
| 907 |
+
# ENHANCED VALIDATION - ACCEPT PDF AND IMAGES
|
| 908 |
+
# ============================================================================
|
| 909 |
+
|
| 910 |
+
if not file.filename:
|
| 911 |
+
raise HTTPException(status_code=400, detail="No filename provided")
|
| 912 |
+
|
| 913 |
+
filename_lower = file.filename.lower()
|
| 914 |
+
|
| 915 |
+
# Supported formats
|
| 916 |
+
SUPPORTED_EXTENSIONS = ['.pdf', '.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
|
| 917 |
+
|
| 918 |
+
file_extension = None
|
| 919 |
+
for ext in SUPPORTED_EXTENSIONS:
|
| 920 |
+
if filename_lower.endswith(ext):
|
| 921 |
+
file_extension = ext
|
| 922 |
+
break
|
| 923 |
+
|
| 924 |
+
if not file_extension:
|
| 925 |
raise HTTPException(
|
| 926 |
+
status_code=400,
|
| 927 |
+
detail=f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, TIFF, BMP"
|
| 928 |
+
)
|
| 929 |
+
|
| 930 |
+
is_image_file = file_extension in ['.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp']
|
| 931 |
+
|
| 932 |
+
# Check PIL availability for image files
|
| 933 |
+
if is_image_file and not GEMINI_AVAILABLE:
|
| 934 |
+
raise HTTPException(
|
| 935 |
+
status_code=500,
|
| 936 |
+
detail="Image processing requires PIL. Install: pip install Pillow"
|
| 937 |
+
)
|
| 938 |
|
| 939 |
# Check blob storage
|
| 940 |
if use_blob_storage and not get_blob_service_client():
|
|
|
|
| 950 |
|
| 951 |
# Stream upload to temp file
|
| 952 |
max_size_bytes = max_file_size_mb * 1024 * 1024
|
| 953 |
+
fd, temp_path = tempfile.mkstemp(suffix=file_extension)
|
| 954 |
os.close(fd)
|
| 955 |
|
| 956 |
doc = None
|
| 957 |
original_pdf_bytes = None
|
| 958 |
start_time = datetime.now()
|
| 959 |
+
pdf_path = temp_path
|
| 960 |
+
original_filename = file.filename
|
| 961 |
|
| 962 |
try:
|
| 963 |
print(f"\n{'='*70}")
|
| 964 |
print(f"π₯ Processing: {file.filename}")
|
| 965 |
+
print(f" File Type: {'IMAGE' if is_image_file else 'PDF'}")
|
| 966 |
print(f" Batch ID: {batch_id}")
|
| 967 |
print(
|
| 968 |
f" Performance Mode: {'Smart Sampling' if use_smart_sampling else f'Parallel ({parallel_batch_size} workers)'}")
|
|
|
|
| 982 |
file_size_mb = total_size / (1024 * 1024)
|
| 983 |
print(f"πΎ File size: {file_size_mb:.2f}MB")
|
| 984 |
|
| 985 |
+
# ============================================================================
|
| 986 |
+
# IMAGE TO PDF CONVERSION
|
| 987 |
+
# ============================================================================
|
| 988 |
+
|
| 989 |
+
if is_image_file:
|
| 990 |
+
print(f"πΌοΈ Converting image to PDF...")
|
| 991 |
+
try:
|
| 992 |
+
from PIL import Image as PILImage
|
| 993 |
+
|
| 994 |
+
# Open image and convert to PDF
|
| 995 |
+
img = PILImage.open(temp_path)
|
| 996 |
+
|
| 997 |
+
# Convert to RGB if necessary (for RGBA, grayscale, etc.)
|
| 998 |
+
if img.mode != 'RGB':
|
| 999 |
+
img = img.convert('RGB')
|
| 1000 |
+
|
| 1001 |
+
# Create PDF path
|
| 1002 |
+
pdf_path = temp_path.replace(file_extension, '.pdf')
|
| 1003 |
+
|
| 1004 |
+
# Save as PDF
|
| 1005 |
+
img.save(pdf_path, 'PDF', resolution=100.0)
|
| 1006 |
+
img.close()
|
| 1007 |
+
|
| 1008 |
+
print(f"β
Image converted to PDF")
|
| 1009 |
+
|
| 1010 |
+
# Update filename for storage
|
| 1011 |
+
file.filename = file.filename.replace(file_extension, '.pdf')
|
| 1012 |
+
|
| 1013 |
+
except Exception as e:
|
| 1014 |
+
print(f"β Image conversion failed: {e}")
|
| 1015 |
+
raise HTTPException(
|
| 1016 |
+
status_code=500,
|
| 1017 |
+
detail=f"Failed to convert image to PDF: {str(e)}"
|
| 1018 |
+
)
|
| 1019 |
+
|
| 1020 |
+
# Read PDF bytes (either original or converted)
|
| 1021 |
+
with open(pdf_path, "rb") as f:
|
| 1022 |
original_pdf_bytes = f.read()
|
| 1023 |
|
| 1024 |
# Upload original PDF to Raw folder
|
| 1025 |
raw_pdf_info = None
|
| 1026 |
if use_blob_storage:
|
| 1027 |
try:
|
| 1028 |
+
print(f"\nπ€ Uploading original {'PDF' if not is_image_file else 'converted PDF'} to Raw folder...")
|
| 1029 |
raw_pdf_info = upload_raw_pdf_to_blob(
|
| 1030 |
original_pdf_bytes,
|
| 1031 |
file.filename,
|
|
|
|
| 1037 |
print(f"β οΈ Failed to upload raw PDF: {e}")
|
| 1038 |
|
| 1039 |
# Open PDF for processing
|
| 1040 |
+
doc = fitz.open(pdf_path)
|
| 1041 |
if doc.page_count == 0:
|
| 1042 |
raise HTTPException(status_code=400, detail="Empty PDF")
|
| 1043 |
|
|
|
|
| 1230 |
# Close document
|
| 1231 |
doc.close()
|
| 1232 |
doc = None
|
| 1233 |
+
|
| 1234 |
+
# Clean up temp files
|
| 1235 |
remove_file(temp_path)
|
| 1236 |
+
if pdf_path != temp_path:
|
| 1237 |
+
remove_file(pdf_path)
|
| 1238 |
+
|
| 1239 |
gc.collect()
|
| 1240 |
|
| 1241 |
# Calculate total processing time
|
|
|
|
| 1253 |
},
|
| 1254 |
"source_file": {
|
| 1255 |
"name": file.filename,
|
| 1256 |
+
"original_name": original_filename,
|
| 1257 |
"size_mb": round(file_size_mb, 2),
|
| 1258 |
"total_pages": total_pages_count,
|
| 1259 |
"pdf_type": "image-based" if is_image_pdf else "text-based",
|
| 1260 |
+
"was_converted": is_image_file,
|
| 1261 |
"raw_pdf": raw_pdf_info
|
| 1262 |
},
|
| 1263 |
"summary": {
|
|
|
|
| 1280 |
print(f"\n{'='*70}")
|
| 1281 |
print(f"β
SUCCESS!")
|
| 1282 |
print(f" Batch ID: {batch_id}")
|
| 1283 |
+
print(f" Original File: {original_filename}")
|
| 1284 |
+
if is_image_file:
|
| 1285 |
+
print(f" β Image converted to PDF")
|
| 1286 |
print(
|
| 1287 |
f" Raw PDF: {raw_pdf_info['blob_name'] if raw_pdf_info else 'Not uploaded'}")
|
| 1288 |
print(f" Split invoices: {len(all_parts)}")
|
|
|
|
| 1305 |
if doc:
|
| 1306 |
doc.close()
|
| 1307 |
remove_file(temp_path)
|
| 1308 |
+
if pdf_path != temp_path:
|
| 1309 |
+
remove_file(pdf_path)
|
| 1310 |
gc.collect()
|
| 1311 |
|
| 1312 |
|