anujakkulkarni committed on
Commit
c07129b
·
verified ·
1 Parent(s): a072bcb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -50
app.py CHANGED
@@ -1,14 +1,15 @@
1
- from fastapi import FastAPI, File, UploadFile, Form
2
- from fastapi.responses import JSONResponse
3
- from fastapi.middleware.cors import CORSMiddleware
4
- import fitz # PyMuPDF
5
  import io
6
  import re
7
  import base64
 
 
 
 
 
 
8
 
9
  app = FastAPI(title="Invoice Splitter API")
10
 
11
- # Allow CORS (optional but helpful for Flutter/JS frontend)
12
  app.add_middleware(
13
  CORSMiddleware,
14
  allow_origins=["*"],
@@ -17,59 +18,118 @@ app.add_middleware(
17
  allow_headers=["*"],
18
  )
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  @app.post("/split-invoices")
21
  async def split_invoices(
22
  file: UploadFile = File(...),
23
  include_pdf: bool = Form(True),
24
- initial_dpi: int = Form(300)
25
  ):
 
 
 
 
 
 
 
26
  try:
27
- pdf_data = await file.read()
28
- pdf = fitz.open(stream=pdf_data, filetype="pdf")
29
-
30
- invoice_pattern = re.compile(r"\b[A-Z0-9]{3,10}\b") # Example pattern
31
- splits = []
32
- current_invoice = None
33
- current_pages = []
34
-
35
- for page_num, page in enumerate(pdf, start=1):
36
- text = page.get_text("text")
37
-
38
- match = re.search(r"Invoice\s*No[:\s\-]*([A-Z0-9]+)", text, re.I)
39
- if match:
40
- invoice_no = match.group(1)
41
- if current_invoice:
42
- splits.append({
 
 
 
 
 
 
43
  "invoice_no": current_invoice,
44
- "pages": current_pages.copy()
45
  })
46
- current_pages.clear()
47
- current_invoice = invoice_no
48
-
49
- current_pages.append(page_num)
50
-
51
- if current_invoice and current_pages:
52
- splits.append({"invoice_no": current_invoice, "pages": current_pages})
53
-
54
- results = []
55
- for split in splits:
56
- doc = fitz.open()
57
- for pno in split["pages"]:
58
- doc.insert_pdf(pdf, from_page=pno-1, to_page=pno-1)
59
- pdf_bytes = doc.tobytes()
60
- base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8') if include_pdf else None
61
-
62
- results.append({
63
- "invoice_no": split["invoice_no"],
64
- "num_pages": len(split["pages"]),
65
- "pages": split["pages"],
66
- "pdf_base64": base64_pdf
67
- })
68
-
69
- return JSONResponse({
70
- "count": len(results),
71
- "parts": results
72
- })
73
 
 
 
74
  except Exception as e:
75
  return JSONResponse({"error": str(e)}, status_code=500)
 
 
 
 
 
1
  import io
2
  import re
3
  import base64
4
+ from typing import List, Dict, Optional
5
+
6
+ from fastapi import FastAPI, File, UploadFile, Form, HTTPException
7
+ from fastapi.middleware.cors import CORSMiddleware
8
+ from fastapi.responses import JSONResponse
9
+ import fitz # PyMuPDF
10
 
11
  app = FastAPI(title="Invoice Splitter API")
12
 
 
13
  app.add_middleware(
14
  CORSMiddleware,
15
  allow_origins=["*"],
 
18
  allow_headers=["*"],
19
  )
20
 
21
# Matches an "Inv No" / "Invoice No" label (optional trailing dot, optional
# ":" or "-" separator, any amount of whitespace) and captures the token that
# follows it. The token may contain letters, digits, "-" and "/".
INVOICE_NO_RE = re.compile(
    r"(?:Inv\s*No\.?|Invoice\s*No\.?)\s*[:\-]?\s*([A-Za-z0-9\-\/]+)",
    re.IGNORECASE,
)


def _first_valid_invoice_no(text: str) -> Optional[str]:
    """Return the first plausible invoice number found in *text*, or None.

    Every label occurrence is examined, not just the first one, so a
    rejected candidate does not hide a valid number later in the text.
    """
    pos = 0
    while True:
        m = INVOICE_NO_RE.search(text, pos)
        if m is None:
            return None
        candidate = m.group(1).strip()
        # Reject the label word itself ("Invoice No Invoice ...") and
        # captures made only of separators such as "-" or "--".
        if candidate.lower() != "invoice" and any(
            ch.isalnum() for ch in candidate
        ):
            return candidate
        # Resume at the rejected token rather than after it: the token may
        # itself be the start of the next label ("Invoice No Invoice No: 1").
        # m.start(1) is always past m.start(), so the loop makes progress.
        pos = m.start(1)


def extract_invoice_no_from_page(page: "fitz.Page") -> Optional[str]:
    """Extract the invoice number printed on a single PDF page.

    The full page text is searched first. If it yields no acceptable
    candidate, each entry of ``page.get_text("blocks")`` is searched on its
    own, which can help when the flat text extraction separates the label
    from its value.

    Args:
        page: Object exposing ``get_text("text")`` and ``get_text("blocks")``.

    Returns:
        The invoice number as found in the text, or ``None`` when the page
        has no "Inv No" / "Invoice No" label followed by a plausible value.
    """
    # 1) Full page text.
    inv = _first_valid_invoice_no(page.get_text("text") or "")
    if inv is not None:
        return inv

    # 2) Block-level fallback. Each block is a tuple whose fifth item
    #    (index 4) is read as the block's text.
    for block in page.get_text("blocks") or []:
        block_text = block[4] if len(block) > 4 else ""
        inv = _first_valid_invoice_no(block_text or "")
        if inv is not None:
            return inv

    return None
54
+
55
def build_pdf_from_pages(src_doc: "fitz.Document", page_indices: List[int]) -> bytes:
    """Create a new PDF containing the given pages of *src_doc*.

    Args:
        src_doc: Open source document the pages are copied from.
        page_indices: 0-based page indices, copied in the given order.

    Returns:
        The serialized bytes of the newly built PDF.

    Raises:
        ValueError: If *page_indices* is empty.
    """
    if not page_indices:
        # Fail early with a clear message instead of trying to serialize a
        # document that has no pages.
        raise ValueError("page_indices must not be empty")

    out = fitz.open()
    try:
        for i in page_indices:
            # insert_pdf takes a from_page/to_page range, so each page is
            # copied as a one-page range to preserve the requested order.
            out.insert_pdf(src_doc, from_page=i, to_page=i)
        return out.tobytes()
    finally:
        # Release the temporary document even if copying or saving fails.
        out.close()
64
def _group_pages_by_invoice(page_invoice_nos: List[Optional[str]]) -> List[Dict]:
    """Group consecutive pages that belong to the same invoice.

    A new group starts on the first page and whenever a page carries an
    invoice number different from the current group's. Pages without a
    number (``None``) continue the current group. Pages that precede the
    first detected number form their own group with ``invoice_no`` ``None``,
    so no page is ever dropped.

    Args:
        page_invoice_nos: Invoice number detected on each page, in page order.

    Returns:
        A list of ``{"invoice_no": ..., "pages": [0-based indices]}`` dicts.
    """
    groups: List[Dict] = []
    for idx, inv in enumerate(page_invoice_nos):
        if groups and (inv is None or inv == groups[-1]["invoice_no"]):
            groups[-1]["pages"].append(idx)
        else:
            groups.append({"invoice_no": inv, "pages": [idx]})
    return groups


@app.post("/split-invoices")
async def split_invoices(
    file: UploadFile = File(...),
    include_pdf: bool = Form(True),
    initial_dpi: int = Form(300),  # kept for compatibility; not used here
):
    """Split an uploaded PDF into one part per detected invoice number.

    Args:
        file: Uploaded file; its name must end with ".pdf".
        include_pdf: When true, each part also carries its PDF as base64.
        initial_dpi: Accepted for backward compatibility; ignored.

    Returns:
        A JSON response ``{"count": n, "parts": [...]}``. Each part has
        ``invoice_no`` (``None`` when no number was detected for its pages),
        1-based ``pages``, ``num_pages``, ``size_bytes`` and, when
        *include_pdf* is true, ``pdf_base64``. Any unexpected processing
        error is reported as ``{"error": message}`` with status 500.

    Raises:
        HTTPException: 400 when the file is not a PDF by name, is empty,
            or contains no pages.
    """
    # filename may be missing on the upload; treat that as "not a PDF"
    # instead of failing with an AttributeError.
    if not (file.filename or "").lower().endswith(".pdf"):
        raise HTTPException(status_code=400, detail="only PDF is supported")

    file_bytes = await file.read()
    if not file_bytes:
        raise HTTPException(status_code=400, detail="empty file")

    doc = None
    try:
        doc = fitz.open(stream=file_bytes, filetype="pdf")
        if doc.page_count == 0:
            raise HTTPException(status_code=400, detail="no pages found")

        # Invoice number detected on each page (index = 0-based page number).
        page_invoice_nos: List[Optional[str]] = [
            extract_invoice_no_from_page(doc.load_page(i))
            for i in range(doc.page_count)
        ]

        # If no page carries a number, this yields a single group holding
        # the whole document with invoice_no None.
        groups = _group_pages_by_invoice(page_invoice_nos)

        parts = []
        for g in groups:
            part_bytes = build_pdf_from_pages(doc, g["pages"])
            info = {
                "invoice_no": g["invoice_no"],
                "pages": [p + 1 for p in g["pages"]],  # 1-based for humans
                "num_pages": len(g["pages"]),
                "size_bytes": len(part_bytes),
            }
            if include_pdf:
                info["pdf_base64"] = base64.b64encode(part_bytes).decode("ascii")
            parts.append(info)

        return JSONResponse({"count": len(parts), "parts": parts})

    except HTTPException:
        # Let deliberate 4xx responses propagate unchanged.
        raise
    except Exception as e:
        return JSONResponse({"error": str(e)}, status_code=500)
    finally:
        # Close the source document on every path, including errors.
        if doc is not None:
            doc.close()