anujakkulkarni committed on
Commit
1ddf149
·
verified ·
1 Parent(s): 0b694fe

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -0
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, File, UploadFile, Form
2
+ from fastapi.responses import JSONResponse
3
+ from fastapi.middleware.cors import CORSMiddleware
4
+ import fitz # PyMuPDF
5
+ import io
6
+ import re
7
+ import base64
8
+
9
app = FastAPI(title="Invoice Splitter API")

# Allow cross-origin requests from any frontend (e.g. Flutter web / JS).
# Credentials are disabled on purpose: a wildcard origin combined with
# allow_credentials=True is not a valid CORS configuration and would let any
# website issue credentialed requests. If cookies or auth headers are ever
# needed, list the trusted origins explicitly and re-enable credentials.
# NOTE(review): no visible endpoint relies on cookies/auth - confirm.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
19
+
20
# Matches labels such as "Invoice No: X", "Invoice No. X", "Invoice No-X" and
# "Invoice Number X" (case-insensitive). The negative lookahead keeps "No"
# from matching the start of a longer word: without it "Invoice Number: 12"
# captured "umber" and "Invoice Notes" captured "tes" as the invoice number.
_INVOICE_NO_RE = re.compile(
    r"Invoice\s*(?:Number|No)(?![A-Z])\.?[:\s\-]*([A-Z0-9]+)",
    re.IGNORECASE,
)


def _group_pages_by_invoice(page_texts):
    """Group 1-based page numbers into contiguous runs, one per invoice.

    Args:
        page_texts: Iterable of the extracted text of each page, in order.

    Returns:
        A list of ``{"invoice_no": str, "pages": list[int]}`` dicts in
        document order. Pages that precede the first detected invoice number
        are attached to that first invoice; if no invoice number is found on
        any page the list is empty.
    """
    groups = []
    current_invoice = None
    current_pages = []

    for page_num, text in enumerate(page_texts, start=1):
        match = _INVOICE_NO_RE.search(text)
        # Start a new group only when the number actually changes, so a
        # multi-page invoice that repeats its number on every page stays in
        # one part instead of being split page by page.
        if match and match.group(1) != current_invoice:
            if current_invoice is not None:
                groups.append(
                    {"invoice_no": current_invoice, "pages": current_pages}
                )
                current_pages = []
            current_invoice = match.group(1)

        current_pages.append(page_num)

    if current_invoice is not None and current_pages:
        groups.append({"invoice_no": current_invoice, "pages": current_pages})

    return groups


def _extract_pages_base64(pdf, pages):
    """Return a base64 string of a new PDF holding ``pages`` copied from ``pdf``.

    ``pages`` must be a non-empty, contiguous, ascending list of 1-based page
    numbers (which is what ``_group_pages_by_invoice`` produces), so the whole
    run is copied with a single ``insert_pdf`` call.
    """
    part = fitz.open()
    try:
        part.insert_pdf(pdf, from_page=pages[0] - 1, to_page=pages[-1] - 1)
        return base64.b64encode(part.tobytes()).decode("utf-8")
    finally:
        # Always release the temporary document, even if serialisation fails.
        part.close()


@app.post("/split-invoices")
async def split_invoices(
    file: UploadFile = File(...),
    include_pdf: bool = Form(True),
    initial_dpi: int = Form(300)
):
    """Split an uploaded PDF into one part per detected invoice number.

    Args:
        file: The uploaded PDF.
        include_pdf: When true, each part carries its pages as a
            base64-encoded PDF in ``pdf_base64``; otherwise that field is
            ``None`` and no per-invoice PDF is built.
        initial_dpi: Accepted for API compatibility; not used by this
            handler.

    Returns:
        A JSON response ``{"count": int, "parts": [...]}`` where every part
        has ``invoice_no``, ``num_pages``, ``pages`` (1-based) and
        ``pdf_base64``. Any failure is reported as
        ``{"error": <message>}`` with HTTP status 500.
    """
    try:
        pdf_data = await file.read()
        pdf = fitz.open(stream=pdf_data, filetype="pdf")
        try:
            splits = _group_pages_by_invoice(
                page.get_text("text") for page in pdf
            )

            results = [
                {
                    "invoice_no": split["invoice_no"],
                    "num_pages": len(split["pages"]),
                    "pages": split["pages"],
                    # Only build the sub-PDF when the caller asked for it.
                    "pdf_base64": (
                        _extract_pages_base64(pdf, split["pages"])
                        if include_pdf
                        else None
                    ),
                }
                for split in splits
            ]
        finally:
            # Release the source document on both success and failure.
            pdf.close()

        return JSONResponse({
            "count": len(results),
            "parts": results
        })

    except Exception as e:
        # Top-level request boundary: report the failure to the client
        # instead of letting the exception propagate.
        return JSONResponse({"error": str(e)}, status_code=500)