rairo committed on
Commit
a7f661b
·
verified ·
1 Parent(s): 2a04067

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +21 -5
main.py CHANGED
@@ -205,28 +205,44 @@ def process_with_gemini(model, text):
205
  return resp.text
206
  raise
207
 
208
- def process_pdf_pages(model, pdf_file):
 
 
 
 
209
  reader, total_pages = read_pdf_pages(pdf_file)
210
  all_txns = []
 
211
  for pg in range(total_pages):
212
  txt = extract_page_text(reader, pg).strip()
213
  if not txt:
214
  continue
 
 
215
  try:
216
- raw = process_with_gemini(model, txt)
217
  except Exception:
 
218
  continue
219
- # grab the JSON blob
 
220
  start = raw.find("{")
221
  end = raw.rfind("}") + 1
222
  if start < 0 or end <= 0:
223
  continue
 
 
224
  js = raw[start:end].replace("```json", "").replace("```", "")
225
  try:
226
  data = json.loads(js)
227
- all_txns.extend(data.get("transactions", []))
228
  except json.JSONDecodeError:
229
  continue
 
 
 
 
 
 
230
  return all_txns
231
 
232
  # --------- Chat Endpoint ---------
@@ -275,7 +291,7 @@ def upload_statements():
275
  f.seek(0)
276
 
277
  # extract + store transactions
278
- txns = process_pdf_pages(model, f)
279
  for txn in txns:
280
  try:
281
  dt = datetime.strptime(txn["Date"], "%d/%m/%Y")
 
205
  return resp.text
206
  raise
207
 
208
+ def process_pdf_pages(pdf_file):
209
+ """
210
+ Reads each page of the given PDF file, sends it through Gemini,
211
+ extracts the JSON “transactions” array, and returns the full list.
212
+ """
213
  reader, total_pages = read_pdf_pages(pdf_file)
214
  all_txns = []
215
+
216
  for pg in range(total_pages):
217
  txt = extract_page_text(reader, pg).strip()
218
  if not txt:
219
  continue
220
+
221
+ # 1) Call Gemini
222
  try:
223
+ raw = process_with_gemini(txt)
224
  except Exception:
225
+ # Skip this page on any error (including retries inside process_with_gemini)
226
  continue
227
+
228
+ # 2) Locate the JSON payload
229
  start = raw.find("{")
230
  end = raw.rfind("}") + 1
231
  if start < 0 or end <= 0:
232
  continue
233
+
234
+ # 3) Clean up any markdown fences and parse
235
  js = raw[start:end].replace("```json", "").replace("```", "")
236
  try:
237
  data = json.loads(js)
 
238
  except json.JSONDecodeError:
239
  continue
240
+
241
+ # 4) Append all found transactions
242
+ txns = data.get("transactions", [])
243
+ if isinstance(txns, list):
244
+ all_txns.extend(txns)
245
+
246
  return all_txns
247
 
248
  # --------- Chat Endpoint ---------
 
291
  f.seek(0)
292
 
293
  # extract + store transactions
294
+ txns = process_pdf_pages(f)
295
  for txn in txns:
296
  try:
297
  dt = datetime.strptime(txn["Date"], "%d/%m/%Y")