Spaces:

anujakkulkarni
/

splitpdffile

Sleeping

App Files Community

anujakkulkarni committed on Jan 13

Commit

ea68370

verified ·

1 Parent(s): b8cd992

Update app.py

Browse files

Files changed (1) hide show

app.py +19 -19

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ from collections import deque
 from pathlib import Path
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
-from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.requests import Request
 import fitz  # PyMuPDF
@@ -22,7 +22,7 @@ try:
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
-except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
@@ -113,7 +113,7 @@ def check_daily_quota():
     global last_quota_reset, daily_quota_exhausted
     now = datetime.datetime.now()
-    if last_quota_reset is None:
         last_quota_reset = now
         daily_quota_exhausted = False
         return True
@@ -185,10 +185,10 @@ def reset_to_primary_model():
 # --- Regex Patterns ---
 INVOICE_NO_RE = re.compile(
-    r"""(?: Invoice\s*No\. ?|Inv\.  ?\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
     re.IGNORECASE | re.VERBOSE
 )
-PREFIXED_INVOICE_RE = re.compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
 GST_LIKE_RE = re.compile(r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
@@ -204,14 +204,14 @@ def is_image_based_pdf(doc: fitz.Document, sample_pages: int = 3) -> Tuple[bool,
 # --- Extraction Logic ---
 def normalize_text_for_search(s: str) -> str:
-    if not s:
         return s
     s = s.replace("\u00A0", " ")
     return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
-    if not text:
         return None
     text_norm = normalize_text_for_search(text)
@@ -237,10 +237,10 @@ def try_extract_invoice_from_text(text: str) -> Optional[str]:
 def extract_invoice_gemini(page:  fitz.Page, retry_count=0) -> Optional[str]:
-    if not check_daily_quota():
         return None
     model = get_gemini_model()
-    if not model:
         return None
     if not gemini_rate_limiter.allow_request():
@@ -251,7 +251,7 @@ def extract_invoice_gemini(page:  fitz.Page, retry_count=0) -> Optional[str]:
     try:
         # ⭐ Reduced resolution from 2x to 1.5x to save memory
-        pix = page.get_pixmap(matrix=fitz.Matrix(1. 5, 1.5), dpi=150)
         img_bytes = pix.tobytes("png")
         # ⭐ Explicitly free pixmap memory
@@ -299,14 +299,14 @@ def extract_invoice_no_from_page(page: fitz.Page, is_image_pdf: bool) -> Optiona
     # 1. Try Text Extraction (Fastest)
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
-    if inv:
         return inv
     # 2. Try Block Extraction
     for block in (page.get_text("blocks") or []):
         if len(block) > 4 and block[4]:
             inv = try_extract_invoice_from_text(block[4])
-            if inv:
                 return inv
     # 3. Gemini Fallback (Only if enabled and seemingly image-based)
@@ -444,7 +444,7 @@ async def split_invoices(
         for i in range(doc. page_count):
             # ⭐ Progress logging for large documents
             if i > 0 and i % 50 == 0:
-                print(f"   📄 Processed {i}/{doc.page_count} pages")
             page = doc. load_page(i)
@@ -558,7 +558,7 @@ async def split_invoices(
             }
         })
-    except HTTPException:
         raise  # Re-raise HTTP exceptions as-is
     except Exception as e:
@@ -574,7 +574,7 @@ async def split_invoices(
                 doc.close()
                 print("📕 Closed PDF document")
             except Exception as e:
-                print(f"⚠️ Error closing document: {e}")
         # Delete temp file
         remove_file(temp_path)
@@ -590,7 +590,7 @@ async def split_invoices_stream(
     max_file_size_mb: int = Form(200)
 ):
     """
-    Streaming version for extremely large files.
     Returns NDJSON (newline-delimited JSON) with each part as a separate line.
     This avoids building a large JSON response in memory.
@@ -638,7 +638,7 @@ async def split_invoices_stream(
             # Extract invoice numbers
             page_invoice_nos = []
             for i in range(doc.page_count):
-                page = doc. load_page(i)
                 inv = extract_invoice_no_from_page(page, is_image_pdf)
                 page_invoice_nos.append(inv)
                 page = None
@@ -701,7 +701,7 @@ async def split_invoices_stream(
                 "error": str(e)
             }) + "\n"
         finally:
-            if doc:
                 doc.close()
             remove_file(temp_path)
             gc.collect()
@@ -710,7 +710,7 @@ async def split_invoices_stream(
         generate_parts(),
         media_type="application/x-ndjson",
         headers={
-            "Content-Disposition": f"attachment; filename=invoices-split. ndjson"
         }
     )

 from pathlib import Path
 from fastapi import FastAPI, File, UploadFile, Form, HTTPException, BackgroundTasks
+from fastapi. middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
 from starlette.requests import Request
 import fitz  # PyMuPDF
     import google.generativeai as genai
     from PIL import Image
     GEMINI_AVAILABLE = True
+except ImportError:
     GEMINI_AVAILABLE = False
     print("Warning: google-generativeai not installed.  Image-based PDFs won't be supported.")
     global last_quota_reset, daily_quota_exhausted
     now = datetime.datetime.now()
+    if last_quota_reset is None:
         last_quota_reset = now
         daily_quota_exhausted = False
         return True
 # --- Regex Patterns ---
 INVOICE_NO_RE = re.compile(
+    r"""(?: Invoice\s*No\.?|Inv\.\s*No\.?|Bill\s*No\.?|Document\s*No\.?|Doc\s*No\.?|Tax\s*Invoice\s*No\.?)\s*[:\-]?\s*([A-Z0-9][A-Z0-9\-\/]{3,})""",
     re.IGNORECASE | re.VERBOSE
 )
+PREFIXED_INVOICE_RE = re. compile(r"\b([A-Z]{2,4}[-/]\d{4,}(?:/\d+)?[A-Z]*)\b")
 GST_LIKE_RE = re.compile(r"\b((?: GSTIN|GST\s*No\.?|GST\s*IN|GST)[\s:\-]*([0-9A-Z]{15}))\b", re.IGNORECASE)
 # --- Extraction Logic ---
 def normalize_text_for_search(s: str) -> str:
+    if not s:
         return s
     s = s.replace("\u00A0", " ")
     return re.sub(r"[ ]{2,}", " ", re.sub(r"[\r\n\t]+", " ", s)).strip()
 def try_extract_invoice_from_text(text: str) -> Optional[str]:
+    if not text:
         return None
     text_norm = normalize_text_for_search(text)
 def extract_invoice_gemini(page:  fitz.Page, retry_count=0) -> Optional[str]:
+    if not check_daily_quota():
         return None
     model = get_gemini_model()
+    if not model:
         return None
     if not gemini_rate_limiter.allow_request():
     try:
         # ⭐ Reduced resolution from 2x to 1.5x to save memory
+        pix = page.get_pixmap(matrix=fitz.Matrix(1.5, 1.5), dpi=150)
         img_bytes = pix.tobytes("png")
         # ⭐ Explicitly free pixmap memory
     # 1. Try Text Extraction (Fastest)
     text = page.get_text("text") or ""
     inv = try_extract_invoice_from_text(text)
+    if inv:
         return inv
     # 2. Try Block Extraction
     for block in (page.get_text("blocks") or []):
         if len(block) > 4 and block[4]:
             inv = try_extract_invoice_from_text(block[4])
+            if inv:
                 return inv
     # 3. Gemini Fallback (Only if enabled and seemingly image-based)
         for i in range(doc. page_count):
             # ⭐ Progress logging for large documents
             if i > 0 and i % 50 == 0:
+                print(f"   📄 Processed {i}/{doc.page_count} pages")
             page = doc. load_page(i)
             }
         })
+    except HTTPException:
         raise  # Re-raise HTTP exceptions as-is
     except Exception as e:
                 doc.close()
                 print("📕 Closed PDF document")
             except Exception as e:
+                print(f"⚠️ Error closing document:  {e}")
         # Delete temp file
         remove_file(temp_path)
     max_file_size_mb: int = Form(200)
 ):
     """
+    Streaming version for extremely large files.
     Returns NDJSON (newline-delimited JSON) with each part as a separate line.
     This avoids building a large JSON response in memory.
             # Extract invoice numbers
             page_invoice_nos = []
             for i in range(doc.page_count):
+                page = doc.load_page(i)
                 inv = extract_invoice_no_from_page(page, is_image_pdf)
                 page_invoice_nos.append(inv)
                 page = None
                 "error": str(e)
             }) + "\n"
         finally:
+            if doc:
                 doc.close()
             remove_file(temp_path)
             gc.collect()
         generate_parts(),
         media_type="application/x-ndjson",
         headers={
+            "Content-Disposition":  f"attachment; filename=invoices-split. ndjson"
         }
     )