"""FastAPI service that splits uploaded PDFs into fixed-size page blocks.

Each upload is stored locally under ``OUTPUT_FOLDER/<job_id>/<file stem>/``,
split into blocks of pages, and both the original and the splits are uploaded
to Azure Blob Storage. The response describes every uploaded blob and carries
a hand-off payload for the downstream invoice service.
"""

from datetime import datetime, timedelta, timezone
import os
from typing import List, Optional
from urllib.parse import quote
from uuid import uuid4

from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.responses import FileResponse
from PyPDF2 import PdfReader, PdfWriter
import uvicorn

try:
    from azure.core.exceptions import ResourceExistsError
    from azure.storage.blob import (
        BlobServiceClient,
        ContentSettings,
        BlobSasPermissions,
        generate_blob_sas,
    )
    AZURE_AVAILABLE = True
except ImportError:
    AZURE_AVAILABLE = False

app = FastAPI()

OUTPUT_FOLDER = "output_pdfs"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

AZURE_STORAGE_CONNECTION_STRING = os.getenv(
    "AZURE_STORAGE_CONNECTION_STRING", "").strip()
AZURE_STORAGE_ACCOUNT_NAME = os.getenv(
    "AZURE_STORAGE_ACCOUNT_NAME", "").strip()
AZURE_STORAGE_ACCOUNT_KEY = os.getenv("AZURE_STORAGE_ACCOUNT_KEY", "").strip()
AZURE_CONTAINER_NAME = os.getenv(
    "AZURE_CONTAINER_NAME", "invoice-splits").strip()
ROOT_FOLDER = os.getenv("ROOT_FOLDER", "POD").strip()

# Name used when the client sends no usable filename.
DEFAULT_UPLOAD_NAME = "uploaded.pdf"
# Uploads are copied to disk in chunks so a large PDF is never held in memory.
UPLOAD_CHUNK_SIZE = 1024 * 1024
# Lifetime of the read-only SAS links handed back to the caller.
SAS_VALIDITY = timedelta(days=7)


def sanitize_name(value: str) -> str:
    """Return a folder-safe stem derived from a client-supplied file name.

    The directory part and the extension are dropped, every character other
    than alphanumerics, ``-`` and ``_`` becomes ``_``, and leading/trailing
    ``.``/``_`` are stripped. Falls back to ``"uploaded"`` when nothing
    usable remains.
    """
    # Treat Windows separators as separators even on POSIX hosts, so a name
    # such as ``C:\\dir\\file.pdf`` yields ``file`` rather than the full path.
    base = os.path.basename((value or "").replace("\\", "/"))
    stem, _ = os.path.splitext(base)
    stem = stem or "uploaded"
    cleaned = "".join(
        ch if ch.isalnum() or ch in ("-", "_") else "_" for ch in stem
    )
    return cleaned.strip("._") or "uploaded"


def get_blob_service_client():
    """Build a ``BlobServiceClient`` from the configured credentials.

    The connection string takes precedence; otherwise the account name and
    key are used.

    Raises:
        HTTPException: 500 if the Azure SDK is not installed or no
            credentials are configured.
    """
    if not AZURE_AVAILABLE:
        raise HTTPException(
            status_code=500, detail="azure-storage-blob is not installed")

    if AZURE_STORAGE_CONNECTION_STRING:
        return BlobServiceClient.from_connection_string(
            AZURE_STORAGE_CONNECTION_STRING)

    if AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
        account_url = (
            f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net"
        )
        return BlobServiceClient(
            account_url=account_url, credential=AZURE_STORAGE_ACCOUNT_KEY)

    raise HTTPException(
        status_code=500,
        detail="Azure storage credentials are not configured")


def ensure_container_exists(blob_service_client):
    """Create ``AZURE_CONTAINER_NAME`` if it does not exist yet."""
    container_client = blob_service_client.get_container_client(
        AZURE_CONTAINER_NAME)
    if container_client.exists():
        return
    try:
        container_client.create_container()
    except ResourceExistsError:
        # Another request created the container between exists() and
        # create_container(); the desired end state is reached either way.
        pass


def build_blob_path(parts) -> str:
    """Join *parts* into a ``/``-separated blob path.

    Backslashes are converted to ``/`` and surrounding slashes are stripped
    from each part; parts that end up empty are skipped.
    """
    # Normalise separators before stripping so a part such as "\\foo\\"
    # cannot leave empty segments ("//") in the joined path.
    cleaned = (str(part).replace("\\", "/").strip("/") for part in parts)
    return "/".join(segment for segment in cleaned if segment)


def create_blob_url(blob_name: str, base_url: Optional[str] = None) -> str:
    """Return a URL for *blob_name*, with a read-only SAS when possible.

    Args:
        blob_name: Path of the blob inside ``AZURE_CONTAINER_NAME``.
        base_url: Endpoint URL of the blob without a query string. When
            omitted it is derived from ``AZURE_STORAGE_ACCOUNT_NAME``, which
            only yields a valid host if that variable is set.

    Returns:
        The blob URL. A SAS token valid for ``SAS_VALIDITY`` is appended only
        when both the account name and the account key are configured.
    """
    if base_url is None:
        base_url = (
            f"https://{AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net/"
            f"{AZURE_CONTAINER_NAME}/{quote(blob_name, safe='/')}"
        )

    if AZURE_STORAGE_ACCOUNT_NAME and AZURE_STORAGE_ACCOUNT_KEY:
        sas_token = generate_blob_sas(
            account_name=AZURE_STORAGE_ACCOUNT_NAME,
            account_key=AZURE_STORAGE_ACCOUNT_KEY,
            container_name=AZURE_CONTAINER_NAME,
            blob_name=blob_name,
            permission=BlobSasPermissions(read=True),
            expiry=datetime.now(timezone.utc) + SAS_VALIDITY,
        )
        return f"{base_url}?{sas_token}"

    return base_url


def upload_pdf_to_blob(
    blob_service_client, local_path: str, blob_name: str
) -> str:
    """Upload the PDF at *local_path* as *blob_name* and return its URL.

    An existing blob with the same name is overwritten.
    """
    blob_client = blob_service_client.get_blob_client(
        container=AZURE_CONTAINER_NAME,
        blob=blob_name,
    )
    with open(local_path, "rb") as file_data:
        blob_client.upload_blob(
            file_data,
            overwrite=True,
            content_settings=ContentSettings(content_type="application/pdf"),
        )

    # Use the endpoint reported by the client so the URL is also correct when
    # only a connection string is configured. Any query string is dropped so
    # a SAS embedded in the service credentials is never exposed to callers.
    endpoint_url = blob_client.url.split("?", 1)[0]
    return create_blob_url(blob_name, base_url=endpoint_url)


def split_pdf(input_path, output_dir, pages_per_split=30):
    """Split a PDF into consecutive blocks of at most *pages_per_split* pages.

    For every block a ``Split_<n>/Raw`` and a ``Split_<n>/Invoices`` folder is
    created under *output_dir*, and the block is written to
    ``Split_<n>/Raw/Split_<n>_<first>_to_<last>.pdf`` (1-based page numbers).

    Args:
        input_path: Path of the PDF to split.
        output_dir: Directory that receives the ``Split_<n>`` folders.
        pages_per_split: Maximum number of pages per block; must be positive.

    Returns:
        A list with one dict per block describing its id, index, page range,
        file name and local folders.

    Raises:
        ValueError: If *pages_per_split* is not a positive integer.
    """
    if pages_per_split <= 0:
        raise ValueError("pages_per_split must be a positive integer")

    reader = PdfReader(input_path)
    total_pages = len(reader.pages)
    blocks = []

    starts = range(0, total_pages, pages_per_split)
    for split_index, start in enumerate(starts, start=1):
        end = min(start + pages_per_split, total_pages)

        writer = PdfWriter()
        for page_num in range(start, end):
            writer.add_page(reader.pages[page_num])

        split_label = f"Split_{split_index}"
        split_folder = os.path.join(output_dir, split_label)
        split_raw_folder = os.path.join(split_folder, "Raw")
        split_invoices_folder = os.path.join(split_folder, "Invoices")
        os.makedirs(split_raw_folder, exist_ok=True)
        os.makedirs(split_invoices_folder, exist_ok=True)

        filename = f"{split_label}_{start + 1}_to_{end}.pdf"
        output_path = os.path.join(split_raw_folder, filename)
        with open(output_path, "wb") as f:
            writer.write(f)

        blocks.append({
            "split_id": split_label,
            "split_index": split_index,
            "start_page": start + 1,
            "end_page": end,
            "filename": filename,
            "local_path": output_path,
            "local_split_folder": split_folder,
            "local_raw_folder": split_raw_folder,
            "local_invoices_folder": split_invoices_folder,
        })

    return blocks


def _safe_upload_filename(client_name: Optional[str]) -> str:
    """Return the bare file name of an upload, never a path or ``..``."""
    normalised = (client_name or "").replace("\\", "/")
    name = os.path.basename(normalised)
    if name in ("", ".", ".."):
        return DEFAULT_UPLOAD_NAME
    return name


async def _save_upload(upload: UploadFile, destination: str) -> None:
    """Copy *upload* to *destination* in ``UPLOAD_CHUNK_SIZE`` chunks."""
    with open(destination, "wb") as f:
        while True:
            chunk = await upload.read(UPLOAD_CHUNK_SIZE)
            if not chunk:
                break
            f.write(chunk)


@app.post("/split-pdf")
async def split_pdf_api(
    files: Optional[List[UploadFile]] = File(None),
    file: Optional[List[UploadFile]] = File(None),
):
    """Split every uploaded PDF and upload originals and splits to Azure.

    Uploads are accepted under either the ``files`` or the ``file`` form
    field. Split ids (``Split_<n>``) are numbered across the whole batch, so
    they stay unique even when several files are uploaded together.

    Raises:
        HTTPException: 400 if no file was uploaded; 500 if Azure storage is
            unavailable or not configured.
    """
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S')
    job_id = f"{timestamp}_{uuid4().hex[:8]}"

    uploaded_files = []
    if files:
        uploaded_files.extend(files)
    if file:
        uploaded_files.extend(file)

    if not uploaded_files:
        raise HTTPException(status_code=400, detail="No files uploaded")

    blob_service_client = get_blob_service_client()
    ensure_container_exists(blob_service_client)

    response_files = []
    split_files_for_invoice_service = []
    used_file_stems = set()
    batch_split_counter = 1

    for upload in uploaded_files:
        safe_filename = _safe_upload_filename(upload.filename)
        base_stem = sanitize_name(safe_filename)

        # Two uploads with the same stem get "_2", "_3", ... appended so
        # they never share a folder.
        safe_file_stem = base_stem
        suffix = 1
        while safe_file_stem in used_file_stems:
            suffix += 1
            safe_file_stem = f"{base_stem}_{suffix}"
        used_file_stems.add(safe_file_stem)

        # Requested hierarchy:
        # ROOT_FOLDER / {batch_id} / FileName / Raw
        # ROOT_FOLDER / {batch_id} / FileName / Splitted / Split_n / Raw
        # ROOT_FOLDER / {batch_id} / FileName / Splitted / Split_n / Invoices
        batch_local_base = os.path.join(
            OUTPUT_FOLDER, job_id, safe_file_stem)
        raw_job_folder = os.path.join(batch_local_base, "Raw")
        split_job_folder = os.path.join(batch_local_base, "Splitted")
        os.makedirs(raw_job_folder, exist_ok=True)
        os.makedirs(split_job_folder, exist_ok=True)

        input_path = os.path.join(raw_job_folder, safe_filename)
        await _save_upload(upload, input_path)

        blocks = split_pdf(input_path, split_job_folder)

        raw_blob_name = build_blob_path([
            ROOT_FOLDER,
            job_id,
            safe_file_stem,
            "Raw",
            safe_filename,
        ])
        raw_blob_url = upload_pdf_to_blob(
            blob_service_client, input_path, raw_blob_name)

        split_files = []
        for block in blocks:
            unique_split_id = f"Split_{batch_split_counter}"
            batch_split_counter += 1

            split_blob_name = build_blob_path([
                ROOT_FOLDER,
                job_id,
                safe_file_stem,
                "Splitted",
                unique_split_id,
                "Raw",
                block["filename"],
            ])
            split_invoices_blob_folder = build_blob_path([
                ROOT_FOLDER,
                job_id,
                safe_file_stem,
                "Splitted",
                unique_split_id,
                "Invoices",
            ])
            split_blob_url = upload_pdf_to_blob(
                blob_service_client,
                block["local_path"],
                split_blob_name,
            )

            split_files_for_invoice_service.append({
                "batch_id": job_id,
                "file_name": safe_file_stem,
                "split_id": unique_split_id,
                "split_raw_blob_path": split_blob_name,
                "split_raw_url": split_blob_url,
                "target_invoices_blob_folder": split_invoices_blob_folder,
                "app3_form_data": {
                    "batch_id": job_id,
                    "target_invoices_blob_folder": split_invoices_blob_folder,
                    "split_id": unique_split_id,
                    "file_name": safe_file_stem,
                },
            })

            split_files.append({
                "split_id": unique_split_id,
                "pages": f"{block['start_page']}-{block['end_page']}",
                "Raw": {
                    "filename": block["filename"],
                    "blob_path": split_blob_name,
                    "url": split_blob_url,
                },
                "Invoices": [],
            })

        response_files.append({
            "file_name": safe_file_stem,
            "Raw": {
                "original_file": {
                    "filename": safe_filename,
                    "blob_path": raw_blob_name,
                    "url": raw_blob_url,
                }
            },
            "Splitted": split_files,
        })

    return {
        "blob_folder": ROOT_FOLDER,
        "batch_id": job_id,
        "files": response_files,
        "next_service": {
            "service": "splitpdffile (2).py",
            "handoff": split_files_for_invoice_service,
        },
    }


def _find_job_file(job_id: str, filename: str, raw_only: bool = False) -> str:
    """Locate *filename* inside the local folder of *job_id*.

    Args:
        job_id: Batch id taken from the request path (untrusted).
        filename: Exact file name to look for.
        raw_only: When true, only directories named ``Raw`` (compared
            case-insensitively) are searched.

    Returns:
        The path of the first matching file.

    Raises:
        HTTPException: 404 if the job folder is not a direct child of
            ``OUTPUT_FOLDER`` or the file is not found.
    """
    output_root = os.path.realpath(OUTPUT_FOLDER)
    job_dir = os.path.realpath(os.path.join(output_root, job_id))
    # Reject ids such as ".." that would resolve outside the output folder
    # and let the walk below serve arbitrary files.
    if os.path.dirname(job_dir) != output_root:
        raise HTTPException(status_code=404, detail="File not found")

    for root, _, names in os.walk(job_dir):
        if raw_only and os.path.basename(root).lower() != "raw":
            continue
        if filename in names:
            return os.path.join(root, filename)

    raise HTTPException(status_code=404, detail="File not found")


@app.get("/pdfs/{job_id}/{filename}")
def get_pdf(job_id: str, filename: str):
    """Serve the first file named *filename* found anywhere under the job."""
    return FileResponse(_find_job_file(job_id, filename))


@app.get("/raw-pdfs/{job_id}/{filename}")
def get_raw_pdf(job_id: str, filename: str):
    """Serve *filename* from one of the job's ``Raw`` folders."""
    return FileResponse(_find_job_file(job_id, filename, raw_only=True))


if __name__ == "__main__":
    # Pass the app object rather than an import string so the script runs
    # regardless of the module's file name (reload is disabled anyway).
    uvicorn.run(app, host="127.0.0.1", port=8001, reload=False)