self-search-api

Sleeping

rairo committed on May 2, 2025

Commit

a7f661b

verified ·

1 Parent(s): 2a04067

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -205,28 +205,44 @@ def process_with_gemini(model, text):
             return resp.text
         raise
-def process_pdf_pages(model, pdf_file):
     reader, total_pages = read_pdf_pages(pdf_file)
     all_txns = []
     for pg in range(total_pages):
         txt = extract_page_text(reader, pg).strip()
         if not txt:
             continue
         try:
-            raw = process_with_gemini(model, txt)
         except Exception:
             continue
-        # grab the JSON blob
         start = raw.find("{")
         end   = raw.rfind("}") + 1
         if start < 0 or end <= 0:
             continue
         js = raw[start:end].replace("```json", "").replace("```", "")
         try:
             data = json.loads(js)
-            all_txns.extend(data.get("transactions", []))
         except json.JSONDecodeError:
             continue
     return all_txns
 # --------- Chat Endpoint ---------
@@ -275,7 +291,7 @@ def upload_statements():
         f.seek(0)
         # extract + store transactions
-        txns = process_pdf_pages(model, f)
         for txn in txns:
             try:
                 dt = datetime.strptime(txn["Date"], "%d/%m/%Y")

             return resp.text
         raise
+def process_pdf_pages(pdf_file):
+    """
+    Reads each page of the given PDF file, sends it through Gemini,
+    extracts the JSON “transactions” array, and returns the full list.
+    """
     reader, total_pages = read_pdf_pages(pdf_file)
     all_txns = []
     for pg in range(total_pages):
         txt = extract_page_text(reader, pg).strip()
         if not txt:
             continue
+        # 1) Call Gemini
         try:
+            raw = process_with_gemini(txt)
         except Exception:
+            # Skip this page on any error (including retries inside process_with_gemini)
             continue
+        # 2) Locate the JSON payload
         start = raw.find("{")
         end   = raw.rfind("}") + 1
         if start < 0 or end <= 0:
             continue
+        # 3) Clean up any markdown fences and parse
         js = raw[start:end].replace("```json", "").replace("```", "")
         try:
             data = json.loads(js)
         except json.JSONDecodeError:
             continue
+        # 4) Append all found transactions
+        txns = data.get("transactions", [])
+        if isinstance(txns, list):
+            all_txns.extend(txns)
     return all_txns
 # --------- Chat Endpoint ---------
         f.seek(0)
         # extract + store transactions
+        txns = process_pdf_pages(f)
         for txn in txns:
             try:
                 dt = datetime.strptime(txn["Date"], "%d/%m/%Y")