rairo committed on
Commit
9937c2b
·
verified ·
1 Parent(s): 3a70f45

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +129 -497
main.py CHANGED
@@ -6,7 +6,6 @@ import tempfile
6
  import time
7
  from datetime import datetime
8
  from io import BytesIO
9
- from typing import List, Dict, Any, Tuple
10
 
11
  # Third-party imports
12
  from flask import Flask, request, jsonify
@@ -35,39 +34,11 @@ api_key = os.getenv('Gemini')
35
  if not api_key:
36
  logging.warning("Gemini API key not found in environment variables.")
37
 
38
- # -------------------------------------------------------------------------
39
- # CONSTANTS
40
- # -------------------------------------------------------------------------
41
-
42
- MODEL_NAME = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
43
- MAX_TEXT_CHUNK_CHARS = int(os.getenv("MAX_TEXT_CHUNK_CHARS", "12000"))
44
- MAX_TEXT_CHUNK_LINES = int(os.getenv("MAX_TEXT_CHUNK_LINES", "120"))
45
- GEMINI_RETRIES = int(os.getenv("GEMINI_RETRIES", "3"))
46
- RETRY_BASE_SLEEP_SECONDS = float(os.getenv("RETRY_BASE_SLEEP_SECONDS", "2"))
47
- TEXT_MIN_MEANINGFUL_LENGTH = int(os.getenv("TEXT_MIN_MEANINGFUL_LENGTH", "80"))
48
-
49
- # -------------------------------------------------------------------------
50
- # CUSTOM EXCEPTIONS
51
- # -------------------------------------------------------------------------
52
-
53
- class GeminiTransientError(Exception):
54
- pass
55
-
56
- class GeminiTimeoutError(GeminiTransientError):
57
- pass
58
-
59
- class GeminiFatalError(Exception):
60
- pass
61
-
62
- # -------------------------------------------------------------------------
63
- # GEMINI SETUP
64
- # -------------------------------------------------------------------------
65
-
66
  def configure_gemini(api_key):
67
  """Configure Gemini AI model."""
68
  try:
69
  genai.configure(api_key=api_key)
70
- return genai.GenerativeModel(MODEL_NAME)
71
  except Exception as e:
72
  logging.error(f"Error configuring Gemini: {str(e)}")
73
  raise
@@ -85,9 +56,6 @@ RULES:
85
  - Do NOT use the current date (today) unless the document explicitly says "Today".
86
  2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
87
  3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
88
- 4. For bank statements, focus on transaction rows only.
89
- 5. Do not duplicate the same transaction.
90
- 6. If a chunk appears to contain partial rows only, extract only rows that are sufficiently complete.
91
 
92
  FIELDS TO EXTRACT:
93
  - Date: string (DD/MM/YYYY)
@@ -168,7 +136,7 @@ def get_text_prompt_with_fallback_date():
168
  """
169
  current_date = datetime.now().strftime("%d/%m/%Y")
170
  return f"""IMPORTANT: Today's date is {current_date}.
171
- If the text below does not specify a year or date, reasonably assume {current_date} context, but prefer explicit dates in text.
172
 
173
  {FINANCIAL_DOC_PROMPT}
174
  """
@@ -181,9 +149,9 @@ def categorize_transaction(transaction):
181
  """
182
  Categorizes a transaction based strictly on its Type field.
183
  """
184
- tx_type = str(transaction.get('Type', '')).lower()
185
- description = str(transaction.get('Description', '')).lower()
186
- destination = str(transaction.get('Destination_of_funds', '')).lower()
187
 
188
  account_category = "Uncategorized"
189
 
@@ -296,12 +264,9 @@ def categorize_transaction(transaction):
296
  # HELPER FUNCTIONS
297
  # -------------------------------------------------------------------------
298
 
299
- def normalize_whitespace(text: str) -> str:
300
- return re.sub(r"[ \t]+", " ", text or "").strip()
301
-
302
  def extract_json_from_response(response_text):
303
  """Extract valid JSON from Gemini's response, handling Markdown fences."""
304
- cleaned_text = re.sub(r'```json\s*', '', response_text or "", flags=re.IGNORECASE)
305
  cleaned_text = re.sub(r'```\s*', '', cleaned_text)
306
 
307
  match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
@@ -327,109 +292,75 @@ Broken JSON: {broken_json_string}"""
327
  logging.error(f"JSON repair failed: {e}")
328
  return {fallback_key: []}
329
 
330
- def is_timeout_error(exc: Exception) -> bool:
331
- msg = str(exc).lower()
332
- timeout_markers = [
333
- "504",
334
- "timed out",
335
- "timeout",
336
- "deadlineexceeded",
337
- "deadline exceeded",
338
- "gateway timeout",
339
- "upstream request timeout",
340
- ]
341
- return any(marker in msg for marker in timeout_markers)
342
-
343
- def is_retryable_error(exc: Exception) -> bool:
344
- msg = str(exc).lower()
345
- retryable_markers = [
346
- "429",
347
- "resourceexhausted",
348
- "unavailable",
349
- "503",
350
- "500",
351
- "internal",
352
- "connection reset",
353
- "temporarily unavailable",
354
- "service unavailable",
355
- "rate limit",
356
- ]
357
- return is_timeout_error(exc) or any(marker in msg for marker in retryable_markers)
358
 
359
- def sleep_for_retry(attempt: int):
360
- delay = RETRY_BASE_SLEEP_SECONDS * (attempt + 1)
361
- time.sleep(delay)
362
 
363
- def post_process_financial_result(result: Dict[str, Any]) -> Dict[str, Any]:
364
- if 'transactions' in result and isinstance(result['transactions'], list):
365
- result['transactions'] = [categorize_transaction(tx) for tx in result['transactions']]
366
- else:
367
- result['transactions'] = []
368
- return result
369
 
370
- def call_gemini_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
 
 
 
 
371
  """
372
- Generic runner for financial Gemini extraction.
373
- Retries transient failures including 504s.
374
- Raises GeminiTimeoutError specifically so caller can fall back to vision.
375
  """
376
- last_exc = None
377
-
378
  for attempt in range(retries + 1):
379
  try:
380
  response = model.generate_content(
381
  [prompt, content],
382
- generation_config={
383
- "temperature": 0,
384
- "response_mime_type": "application/json",
385
- }
386
  )
387
 
388
  try:
389
  result = extract_json_from_response(response.text)
390
- return post_process_financial_result(result)
 
 
 
 
 
391
  except ValueError as ve:
392
- broken_json = str(ve)
393
- repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
394
- return post_process_financial_result(repaired)
 
 
 
395
 
396
  except Exception as e:
397
- last_exc = e
398
- if is_retryable_error(e):
399
  logging.warning(
400
- f"Gemini transient error on attempt {attempt + 1}/{retries + 1}: {e}"
 
401
  )
402
- if attempt < retries:
403
- sleep_for_retry(attempt)
404
- continue
405
- if is_timeout_error(e):
406
- raise GeminiTimeoutError(str(e))
407
- raise GeminiTransientError(str(e))
408
-
409
- logging.error(f"Gemini fatal error: {e}")
410
- raise GeminiFatalError(str(e))
411
 
412
- if last_exc:
413
- if is_timeout_error(last_exc):
414
- raise GeminiTimeoutError(str(last_exc))
415
- raise GeminiTransientError(str(last_exc))
416
 
417
  return {"transactions": []}
418
 
419
- def call_gemini_students_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
 
420
  """
421
- Generic runner for student Gemini extraction.
 
422
  """
423
- last_exc = None
424
-
425
  for attempt in range(retries + 1):
426
  try:
427
  response = model.generate_content(
428
  [prompt, content],
429
- generation_config={
430
- "temperature": 0,
431
- "response_mime_type": "application/json",
432
- }
433
  )
434
 
435
  try:
@@ -437,36 +368,33 @@ def call_gemini_students_with_retry(model, content, prompt, retries=GEMINI_RETRI
437
  if 'students' not in result or not isinstance(result.get('students'), list):
438
  return {"students": []}
439
  return result
 
440
  except ValueError as ve:
441
- broken_json = str(ve)
442
- repaired = repair_json_with_gemini(model, broken_json, fallback_key="students")
443
  if 'students' not in repaired or not isinstance(repaired.get('students'), list):
444
  return {"students": []}
445
  return repaired
446
 
447
  except Exception as e:
448
- last_exc = e
449
- if is_retryable_error(e):
450
  logging.warning(
451
- f"Gemini student transient error on attempt {attempt + 1}/{retries + 1}: {e}"
 
452
  )
453
- if attempt < retries:
454
- sleep_for_retry(attempt)
455
- continue
456
- if is_timeout_error(e):
457
- raise GeminiTimeoutError(str(e))
458
- raise GeminiTransientError(str(e))
459
-
460
- logging.error(f"Gemini student import fatal error: {e}")
461
- raise GeminiFatalError(str(e))
462
 
463
- if last_exc:
464
- if is_timeout_error(last_exc):
465
- raise GeminiTimeoutError(str(last_exc))
466
- raise GeminiTransientError(str(last_exc))
467
 
468
  return {"students": []}
469
 
 
 
 
 
470
  def is_file_empty(file_path):
471
  """Check if file is empty."""
472
  return os.path.getsize(file_path) == 0
@@ -517,140 +445,6 @@ def ensure_extra_fields_list(value):
517
  return cleaned
518
  return []
519
 
520
- # -------------------------------------------------------------------------
521
- # PDF / TEXT QUALITY HELPERS
522
- # -------------------------------------------------------------------------
523
-
524
- def text_quality_metrics(text: str) -> Dict[str, Any]:
525
- text = text or ""
526
- stripped = text.strip()
527
-
528
- if not stripped:
529
- return {
530
- "length": 0,
531
- "lines": 0,
532
- "alpha_ratio": 0,
533
- "digit_ratio": 0,
534
- "weird_ratio": 1,
535
- "date_hits": 0,
536
- "amount_hits": 0,
537
- "looks_usable": False,
538
- }
539
-
540
- length = len(stripped)
541
- lines = [line.strip() for line in stripped.splitlines() if line.strip()]
542
- joined = "\n".join(lines)
543
-
544
- alpha_count = sum(1 for c in joined if c.isalpha())
545
- digit_count = sum(1 for c in joined if c.isdigit())
546
- printable_count = sum(1 for c in joined if c.isprintable())
547
- weird_count = sum(
548
- 1 for c in joined
549
- if c.isprintable() and not (c.isalnum() or c.isspace() or c in ".,:/()-_&+'*#")
550
- )
551
-
552
- total_chars = max(len(joined), 1)
553
- alpha_ratio = alpha_count / total_chars
554
- digit_ratio = digit_count / total_chars
555
- weird_ratio = weird_count / total_chars
556
-
557
- date_hits = len(re.findall(r'\b\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?\b', joined))
558
- amount_hits = len(re.findall(r'\b\d{1,3}(?:[,\s]\d{3})*(?:\.\d{2})\b', joined))
559
-
560
- looks_usable = (
561
- length >= TEXT_MIN_MEANINGFUL_LENGTH and
562
- weird_ratio < 0.20 and
563
- printable_count > 0 and
564
- (alpha_ratio > 0.15 or date_hits > 1 or amount_hits > 2)
565
- )
566
-
567
- return {
568
- "length": length,
569
- "lines": len(lines),
570
- "alpha_ratio": round(alpha_ratio, 4),
571
- "digit_ratio": round(digit_ratio, 4),
572
- "weird_ratio": round(weird_ratio, 4),
573
- "date_hits": date_hits,
574
- "amount_hits": amount_hits,
575
- "looks_usable": looks_usable,
576
- }
577
-
578
- def should_use_text_strategy(text: str) -> bool:
579
- metrics = text_quality_metrics(text)
580
- return metrics["looks_usable"]
581
-
582
- def split_text_into_chunks(text: str, max_chars: int = MAX_TEXT_CHUNK_CHARS, max_lines: int = MAX_TEXT_CHUNK_LINES) -> List[str]:
583
- """
584
- Chunk large extracted page text to reduce timeout risk.
585
- Keeps line grouping to preserve statement row structure.
586
- """
587
- lines = [line.rstrip() for line in (text or "").splitlines() if line.strip()]
588
- if not lines:
589
- return []
590
-
591
- chunks = []
592
- current_lines = []
593
- current_len = 0
594
-
595
- for line in lines:
596
- proposed_len = current_len + len(line) + 1
597
- if current_lines and (proposed_len > max_chars or len(current_lines) >= max_lines):
598
- chunks.append("\n".join(current_lines))
599
- current_lines = [line]
600
- current_len = len(line) + 1
601
- else:
602
- current_lines.append(line)
603
- current_len = proposed_len
604
-
605
- if current_lines:
606
- chunks.append("\n".join(current_lines))
607
-
608
- return chunks
609
-
610
- def normalize_transaction(tx: Dict[str, Any]) -> Dict[str, Any]:
611
- normalized = {
612
- "Date": str(tx.get("Date", "")).strip(),
613
- "Description": normalize_whitespace(str(tx.get("Description", "")).strip()),
614
- "Customer_name": normalize_whitespace(str(tx.get("Customer_name", "N/A")).strip() or "N/A"),
615
- "City": normalize_whitespace(str(tx.get("City", "N/A")).strip() or "N/A"),
616
- "Amount": tx.get("Amount", 0),
617
- "Type": str(tx.get("Type", "")).strip(),
618
- "Destination_of_funds": normalize_whitespace(str(tx.get("Destination_of_funds", "")).strip()),
619
- "Document_Type": str(tx.get("Document_Type", "")).strip(),
620
- "Account_Category": str(tx.get("Account_Category", "")).strip(),
621
- }
622
-
623
- try:
624
- normalized["Amount"] = float(normalized["Amount"])
625
- except Exception:
626
- normalized["Amount"] = 0.0
627
-
628
- return normalized
629
-
630
- def dedupe_transactions(transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
631
- seen = set()
632
- unique = []
633
-
634
- for tx in transactions:
635
- item = normalize_transaction(tx)
636
- key = (
637
- item.get("Date", "").lower(),
638
- item.get("Description", "").lower(),
639
- round(float(item.get("Amount", 0) or 0), 2),
640
- item.get("Type", "").lower(),
641
- item.get("Document_Type", "").lower(),
642
- )
643
- if key in seen:
644
- continue
645
- seen.add(key)
646
- unique.append(item)
647
-
648
- return unique
649
-
650
- # -------------------------------------------------------------------------
651
- # STUDENT HELPERS
652
- # -------------------------------------------------------------------------
653
-
654
  def build_student_prompt(template_fields=None):
655
  template_fields = template_fields or {}
656
 
@@ -674,9 +468,7 @@ PRE-IMPORT CONFIGURATION:
674
  """
675
 
676
  def normalize_student_record(student, template_fields=None, sequence=None):
677
- """
678
- Normalizes one parsed student record into the required shape.
679
- """
680
  template_fields = template_fields or {}
681
  raw = student or {}
682
 
@@ -704,10 +496,8 @@ def normalize_student_record(student, template_fields=None, sequence=None):
704
  mapped[canonical] = str(normalized_raw[alias]).strip()
705
  break
706
 
707
- alias_flat = {a for aliases in alias_map.values() for a in aliases}
708
-
709
  for key, value in normalized_raw.items():
710
- if key in alias_flat:
711
  continue
712
  if key == "extra_fields":
713
  continue
@@ -891,12 +681,10 @@ def parse_students_from_pdf(model, pdf_path, template_fields=None):
891
  except Exception:
892
  text_content = ""
893
 
894
- if should_use_text_strategy(text_content):
895
- chunks = split_text_into_chunks(text_content)
896
- for chunk in chunks:
897
- result = call_gemini_students_with_retry(model, chunk, prompt)
898
- page_students = result.get('students', []) or []
899
- all_students.extend(page_students)
900
  else:
901
  if PDF_IMAGE_SUPPORT:
902
  page_students = process_student_pdf_page_as_image(
@@ -948,7 +736,7 @@ def read_spreadsheet_students(file_path, filename, template_fields=None):
948
  return parse_students_from_dataframe(df, template_fields=template_fields)
949
 
950
  # -------------------------------------------------------------------------
951
- # CORE LOGIC: PDF PROCESSING (TEXT + CHUNKING + VISION FALLBACK)
952
  # -------------------------------------------------------------------------
953
 
954
  def process_pdf_page_as_image(model, pdf_path, page_num):
@@ -963,139 +751,19 @@ def process_pdf_page_as_image(model, pdf_path, page_num):
963
  result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
964
  return result.get('transactions', [])
965
 
966
- def process_text_chunks_for_transactions(model, text_content: str) -> List[Dict[str, Any]]:
967
- """
968
- Split large text into smaller chunks before sending to Gemini.
969
- This lowers timeout risk for dense bank statement pages.
970
- """
971
- chunks = split_text_into_chunks(text_content)
972
- if not chunks:
973
- return []
974
-
975
- all_transactions = []
976
- for idx, chunk in enumerate(chunks, start=1):
977
- logging.info(f"Processing text chunk {idx}/{len(chunks)}")
978
- result = call_gemini_with_retry(model, chunk, FINANCIAL_DOC_PROMPT)
979
- all_transactions.extend(result.get('transactions', []))
980
-
981
- return dedupe_transactions(all_transactions)
982
-
983
- def process_pdf_page_with_fallback(model, pdf_path: str, page_num: int, text_content: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
984
- """
985
- Page pipeline:
986
- 1. If text quality is good, try chunked text strategy.
987
- 2. On repeated 504/timeout/transient failure, fall back to vision for that page.
988
- 3. If text quality is poor, go directly to vision.
989
- """
990
- metrics = text_quality_metrics(text_content)
991
- page_summary = {
992
- "page": page_num,
993
- "text_quality": metrics,
994
- "strategy_used": None,
995
- "fallback_triggered": False,
996
- "transactions_extracted": 0,
997
- "status": "pending",
998
- "error": None,
999
- }
1000
-
1001
- try:
1002
- if should_use_text_strategy(text_content):
1003
- page_summary["strategy_used"] = "text"
1004
- logging.info(f"Page {page_num}: using chunked text strategy")
1005
- try:
1006
- txs = process_text_chunks_for_transactions(model, text_content)
1007
- page_summary["transactions_extracted"] = len(txs)
1008
- page_summary["status"] = "processed"
1009
- return txs, page_summary
1010
- except (GeminiTimeoutError, GeminiTransientError) as e:
1011
- logging.warning(f"Page {page_num}: text strategy failed, falling back to vision. Error: {e}")
1012
- page_summary["fallback_triggered"] = True
1013
- page_summary["error"] = f"text_strategy_failed: {str(e)}"
1014
-
1015
- if PDF_IMAGE_SUPPORT:
1016
- page_summary["strategy_used"] = "vision_fallback_after_text_failure"
1017
- txs = process_pdf_page_as_image(model, pdf_path, page_num)
1018
- txs = dedupe_transactions(txs)
1019
- page_summary["transactions_extracted"] = len(txs)
1020
- page_summary["status"] = "processed"
1021
- return txs, page_summary
1022
- else:
1023
- page_summary["status"] = "failed"
1024
- page_summary["error"] = f"{page_summary['error']} | vision unavailable"
1025
- return [], page_summary
1026
- else:
1027
- logging.info(f"Page {page_num}: poor/low text quality. Using vision strategy directly.")
1028
- page_summary["strategy_used"] = "vision_direct"
1029
- if PDF_IMAGE_SUPPORT:
1030
- txs = process_pdf_page_as_image(model, pdf_path, page_num)
1031
- txs = dedupe_transactions(txs)
1032
- page_summary["transactions_extracted"] = len(txs)
1033
- page_summary["status"] = "processed"
1034
- return txs, page_summary
1035
- else:
1036
- page_summary["status"] = "failed"
1037
- page_summary["error"] = "low-quality text and vision unavailable"
1038
- return [], page_summary
1039
-
1040
- except Exception as e:
1041
- page_summary["status"] = "failed"
1042
- page_summary["error"] = str(e)
1043
- logging.error(f"Page {page_num}: final failure - {e}")
1044
- return [], page_summary
1045
-
1046
- # -------------------------------------------------------------------------
1047
- # ROOT / HEALTH
1048
- # -------------------------------------------------------------------------
1049
-
1050
- @app.route('/', methods=['GET'])
1051
- def index():
1052
- return jsonify({
1053
- "message": "API is running",
1054
- "version": "2.4.0",
1055
- "model": MODEL_NAME,
1056
- "vision_support": PDF_IMAGE_SUPPORT,
1057
- "endpoints": [
1058
- "/health",
1059
- "/process-pdf",
1060
- "/process-text",
1061
- "/process-image",
1062
- "/transaction-types",
1063
- "/api/customers/parse-students-images",
1064
- "/api/customers/validate-students-import",
1065
- "/api/customers/parse-students-manual"
1066
- ]
1067
- })
1068
-
1069
- @app.route('/health', methods=['GET'])
1070
- def health_check():
1071
- return jsonify({
1072
- 'status': 'healthy',
1073
- 'timestamp': datetime.now().isoformat(),
1074
- 'version': '2.4.0',
1075
- 'vision_support': PDF_IMAGE_SUPPORT,
1076
- 'model': MODEL_NAME
1077
- })
1078
-
1079
- # -------------------------------------------------------------------------
1080
- # FINANCIAL ENDPOINTS
1081
- # -------------------------------------------------------------------------
1082
-
1083
  @app.route('/process-pdf', methods=['POST'])
1084
  def process_pdf():
1085
  """
1086
  Smart PDF Processor:
1087
  1. Checks if empty.
1088
- 2. Tries text extraction per page.
1089
- 3. Uses text-quality gating.
1090
- 4. Chunk-processes large text pages.
1091
- 5. On repeated 504/timeout/transient errors, falls back to Vision for that page.
1092
- 6. Returns partial success instead of failing the whole document for one bad page.
1093
  """
1094
  temp_path = None
1095
  try:
1096
  if 'file' not in request.files:
1097
  return jsonify({'error': 'No file uploaded'}), 400
1098
-
1099
  file = request.files['file']
1100
  if file.filename == '':
1101
  return jsonify({'error': 'No file selected'}), 400
@@ -1107,99 +775,69 @@ def process_pdf():
1107
  if is_file_empty(temp_path):
1108
  return jsonify({'error': 'Uploaded file is empty'}), 400
1109
 
1110
- if not api_key:
1111
- return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1112
-
1113
  model = configure_gemini(api_key)
1114
  all_transactions = []
1115
- pages_summary = []
1116
- failed_pages = []
1117
 
1118
  try:
1119
  reader = pypdf.PdfReader(temp_path)
1120
  num_pages = len(reader.pages)
1121
 
1122
  for i in range(num_pages):
1123
- page_num = i + 1
1124
- logging.info(f"Processing page {page_num}/{num_pages}")
1125
 
 
1126
  try:
1127
- text_content = reader.pages[i].extract_text() or ""
1128
- except Exception as e:
1129
- logging.warning(f"Page {page_num}: text extraction failed: {e}")
1130
  text_content = ""
1131
 
1132
- txs, page_summary = process_pdf_page_with_fallback(
1133
- model=model,
1134
- pdf_path=temp_path,
1135
- page_num=page_num,
1136
- text_content=text_content
1137
- )
1138
 
1139
- all_transactions.extend(txs)
1140
- pages_summary.append(page_summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1141
 
1142
- if page_summary["status"] != "processed":
1143
- failed_pages.append({
1144
- "page": page_num,
1145
- "error": page_summary.get("error")
1146
- })
1147
 
1148
  except pypdf.errors.PdfReadError:
1149
  logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
1150
  if PDF_IMAGE_SUPPORT:
1151
  images = convert_from_path(temp_path)
1152
- for idx, img in enumerate(images, start=1):
1153
- page_summary = {
1154
- "page": idx,
1155
- "text_quality": None,
1156
- "strategy_used": "vision_full_pdf_fallback",
1157
- "fallback_triggered": True,
1158
- "transactions_extracted": 0,
1159
- "status": "pending",
1160
- "error": None,
1161
- }
1162
- try:
1163
- result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
1164
- txs = dedupe_transactions(result.get('transactions', []))
1165
- all_transactions.extend(txs)
1166
- page_summary["transactions_extracted"] = len(txs)
1167
- page_summary["status"] = "processed"
1168
- except Exception as e:
1169
- page_summary["status"] = "failed"
1170
- page_summary["error"] = str(e)
1171
- failed_pages.append({
1172
- "page": idx,
1173
- "error": str(e)
1174
- })
1175
- pages_summary.append(page_summary)
1176
  else:
1177
  raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
1178
 
1179
- all_transactions = dedupe_transactions(all_transactions)
1180
-
1181
- total_pages = len(pages_summary)
1182
- processed_pages = len([p for p in pages_summary if p["status"] == "processed"])
1183
- total_failed_pages = len([p for p in pages_summary if p["status"] != "processed"])
1184
-
1185
- response_payload = {
1186
- 'transactions': all_transactions,
1187
- 'summary': {
1188
- 'pages_total': total_pages,
1189
- 'pages_processed': processed_pages,
1190
- 'pages_failed': total_failed_pages,
1191
- 'transactions_total': len(all_transactions),
1192
- 'partial_success': total_failed_pages > 0 and processed_pages > 0,
1193
- 'success': processed_pages > 0
1194
- },
1195
- 'pages': pages_summary,
1196
- 'failed_pages': failed_pages
1197
- }
1198
-
1199
- if processed_pages == 0:
1200
- return jsonify(response_payload), 500
1201
-
1202
- return jsonify(response_payload), 200
1203
 
1204
  except Exception as e:
1205
  logging.error(f"Server Error: {e}")
@@ -1208,6 +846,10 @@ def process_pdf():
1208
  if temp_path and os.path.exists(temp_path):
1209
  os.remove(temp_path)
1210
 
 
 
 
 
1211
  @app.route('/process-text', methods=['POST'])
1212
  def process_text():
1213
  """Handle raw text input."""
@@ -1217,23 +859,14 @@ def process_text():
1217
  return jsonify({'error': 'No text provided'}), 400
1218
 
1219
  text_input = data['text']
1220
- if not str(text_input).strip():
1221
  return jsonify({'error': 'Text input cannot be empty'}), 400
1222
 
1223
- if not api_key:
1224
- return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1225
-
1226
  model = configure_gemini(api_key)
1227
  prompt = get_text_prompt_with_fallback_date()
1228
 
1229
- chunks = split_text_into_chunks(text_input)
1230
- all_transactions = []
1231
-
1232
- for chunk in chunks or [text_input]:
1233
- result = call_gemini_with_retry(model, chunk, prompt)
1234
- all_transactions.extend(result.get('transactions', []))
1235
-
1236
- return jsonify({'transactions': dedupe_transactions(all_transactions)})
1237
 
1238
  except Exception as e:
1239
  logging.error(f"Error: {e}")
@@ -1258,14 +891,11 @@ def process_image():
1258
  file.save(tmp.name)
1259
  temp_path = tmp.name
1260
 
1261
- if not api_key:
1262
- return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1263
-
1264
  model = configure_gemini(api_key)
1265
  img = Image.open(temp_path)
1266
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
1267
 
1268
- return jsonify({'transactions': dedupe_transactions(result.get('transactions', []))})
1269
 
1270
  except Exception as e:
1271
  logging.error(f"Error: {e}")
@@ -1299,9 +929,6 @@ def parse_students_images():
1299
  if not uploaded_files:
1300
  return jsonify({"error": "No files uploaded"}), 400
1301
 
1302
- if not api_key:
1303
- return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1304
-
1305
  template_fields = parse_json_safely(request.form.get("template_fields"), default={})
1306
  model = configure_gemini(api_key)
1307
 
@@ -1528,9 +1155,14 @@ def get_transaction_types():
1528
  }
1529
  return jsonify(transaction_types)
1530
 
1531
- # -------------------------------------------------------------------------
1532
- # MAIN
1533
- # -------------------------------------------------------------------------
 
 
 
 
 
1534
 
1535
  if __name__ == '__main__':
1536
  app.run(debug=True, host="0.0.0.0", port=7860)
 
6
  import time
7
  from datetime import datetime
8
  from io import BytesIO
 
9
 
10
  # Third-party imports
11
  from flask import Flask, request, jsonify
 
34
  if not api_key:
35
  logging.warning("Gemini API key not found in environment variables.")
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def configure_gemini(api_key):
38
  """Configure Gemini AI model."""
39
  try:
40
  genai.configure(api_key=api_key)
41
+ return genai.GenerativeModel('gemini-2.5-flash')
42
  except Exception as e:
43
  logging.error(f"Error configuring Gemini: {str(e)}")
44
  raise
 
56
  - Do NOT use the current date (today) unless the document explicitly says "Today".
57
  2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
58
  3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
 
 
 
59
 
60
  FIELDS TO EXTRACT:
61
  - Date: string (DD/MM/YYYY)
 
136
  """
137
  current_date = datetime.now().strftime("%d/%m/%Y")
138
  return f"""IMPORTANT: Today's date is {current_date}.
139
+ If the text below does not specify a year or date, reasonable assume {current_date} context, but prefer explicit dates in text.
140
 
141
  {FINANCIAL_DOC_PROMPT}
142
  """
 
149
  """
150
  Categorizes a transaction based strictly on its Type field.
151
  """
152
+ tx_type = transaction.get('Type', '').lower()
153
+ description = transaction.get('Description', '').lower()
154
+ destination = transaction.get('Destination_of_funds', '').lower()
155
 
156
  account_category = "Uncategorized"
157
 
 
264
  # HELPER FUNCTIONS
265
  # -------------------------------------------------------------------------
266
 
 
 
 
267
  def extract_json_from_response(response_text):
268
  """Extract valid JSON from Gemini's response, handling Markdown fences."""
269
+ cleaned_text = re.sub(r'```json\s*', '', response_text)
270
  cleaned_text = re.sub(r'```\s*', '', cleaned_text)
271
 
272
  match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
 
292
  logging.error(f"JSON repair failed: {e}")
293
  return {fallback_key: []}
294
 
295
+ # -------------------------------------------------------------------------
296
+ # RETRYABLE ERROR DETECTION
297
+ # -------------------------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
298
 
299
+ RETRYABLE_CODES = ["429", "503", "504", "ResourceExhausted", "DeadlineExceeded", "UNAVAILABLE"]
 
 
300
 
301
+ def is_retryable(error: Exception) -> bool:
302
+ err = str(error)
303
+ return any(code in err for code in RETRYABLE_CODES)
 
 
 
304
 
305
+ # -------------------------------------------------------------------------
306
+ # CORE GEMINI CALLERS WITH RETRY + 504 HANDLING
307
+ # -------------------------------------------------------------------------
308
+
309
+ def call_gemini_with_retry(model, content, prompt, retries=3):
310
  """
311
+ Financial extraction runner.
312
+ Retries on 429, 503, 504 / DeadlineExceeded with exponential backoff.
 
313
  """
 
 
314
  for attempt in range(retries + 1):
315
  try:
316
  response = model.generate_content(
317
  [prompt, content],
318
+ request_options={"timeout": 120}
 
 
 
319
  )
320
 
321
  try:
322
  result = extract_json_from_response(response.text)
323
+ if 'transactions' in result:
324
+ result['transactions'] = [
325
+ categorize_transaction(tx) for tx in result['transactions']
326
+ ]
327
+ return result
328
+
329
  except ValueError as ve:
330
+ repaired = repair_json_with_gemini(model, str(ve), fallback_key="transactions")
331
+ if 'transactions' in repaired:
332
+ repaired['transactions'] = [
333
+ categorize_transaction(tx) for tx in repaired['transactions']
334
+ ]
335
+ return repaired
336
 
337
  except Exception as e:
338
+ if is_retryable(e) and attempt < retries:
339
+ wait = 2 ** attempt # 1s → 2s → 4s
340
  logging.warning(
341
+ f"Gemini retryable error (attempt {attempt + 1}/{retries + 1}): {e}. "
342
+ f"Retrying in {wait}s..."
343
  )
344
+ time.sleep(wait)
345
+ continue
 
 
 
 
 
 
 
346
 
347
+ logging.error(f"Gemini Error: {e}")
348
+ if attempt == retries:
349
+ raise
 
350
 
351
  return {"transactions": []}
352
 
353
+
354
+ def call_gemini_students_with_retry(model, content, prompt, retries=3):
355
  """
356
+ Student extraction runner.
357
+ Retries on 429, 503, 504 / DeadlineExceeded with exponential backoff.
358
  """
 
 
359
  for attempt in range(retries + 1):
360
  try:
361
  response = model.generate_content(
362
  [prompt, content],
363
+ request_options={"timeout": 120}
 
 
 
364
  )
365
 
366
  try:
 
368
  if 'students' not in result or not isinstance(result.get('students'), list):
369
  return {"students": []}
370
  return result
371
+
372
  except ValueError as ve:
373
+ repaired = repair_json_with_gemini(model, str(ve), fallback_key="students")
 
374
  if 'students' not in repaired or not isinstance(repaired.get('students'), list):
375
  return {"students": []}
376
  return repaired
377
 
378
  except Exception as e:
379
+ if is_retryable(e) and attempt < retries:
380
+ wait = 2 ** attempt
381
  logging.warning(
382
+ f"Gemini Student retryable error (attempt {attempt + 1}/{retries + 1}): {e}. "
383
+ f"Retrying in {wait}s..."
384
  )
385
+ time.sleep(wait)
386
+ continue
 
 
 
 
 
 
 
387
 
388
+ logging.error(f"Gemini Student Import Error: {e}")
389
+ if attempt == retries:
390
+ raise
 
391
 
392
  return {"students": []}
393
 
394
+ # -------------------------------------------------------------------------
395
+ # UTILITY FUNCTIONS
396
+ # -------------------------------------------------------------------------
397
+
398
  def is_file_empty(file_path):
399
  """Check if file is empty."""
400
  return os.path.getsize(file_path) == 0
 
445
  return cleaned
446
  return []
447
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
448
  def build_student_prompt(template_fields=None):
449
  template_fields = template_fields or {}
450
 
 
468
  """
469
 
470
  def normalize_student_record(student, template_fields=None, sequence=None):
471
+ """Normalizes one parsed student record into the required shape."""
 
 
472
  template_fields = template_fields or {}
473
  raw = student or {}
474
 
 
496
  mapped[canonical] = str(normalized_raw[alias]).strip()
497
  break
498
 
 
 
499
  for key, value in normalized_raw.items():
500
+ if key in {a for aliases in alias_map.values() for a in aliases}:
501
  continue
502
  if key == "extra_fields":
503
  continue
 
681
  except Exception:
682
  text_content = ""
683
 
684
+ if text_content and len(text_content.strip()) > 50:
685
+ result = call_gemini_students_with_retry(model, text_content, prompt)
686
+ page_students = result.get('students', []) or []
687
+ all_students.extend(page_students)
 
 
688
  else:
689
  if PDF_IMAGE_SUPPORT:
690
  page_students = process_student_pdf_page_as_image(
 
736
  return parse_students_from_dataframe(df, template_fields=template_fields)
737
 
738
  # -------------------------------------------------------------------------
739
+ # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
740
  # -------------------------------------------------------------------------
741
 
742
  def process_pdf_page_as_image(model, pdf_path, page_num):
 
751
  result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
752
  return result.get('transactions', [])
753
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
754
  @app.route('/process-pdf', methods=['POST'])
755
  def process_pdf():
756
  """
757
  Smart PDF Processor:
758
  1. Checks if empty.
759
+ 2. Tries standard Text extraction per page.
760
+ 3. On 504/timeout or low text, falls back to Vision per page.
761
+ 4. If Vision also fails, skips page and continues (no full document crash).
 
 
762
  """
763
  temp_path = None
764
  try:
765
  if 'file' not in request.files:
766
  return jsonify({'error': 'No file uploaded'}), 400
 
767
  file = request.files['file']
768
  if file.filename == '':
769
  return jsonify({'error': 'No file selected'}), 400
 
775
  if is_file_empty(temp_path):
776
  return jsonify({'error': 'Uploaded file is empty'}), 400
777
 
 
 
 
778
  model = configure_gemini(api_key)
779
  all_transactions = []
 
 
780
 
781
  try:
782
  reader = pypdf.PdfReader(temp_path)
783
  num_pages = len(reader.pages)
784
 
785
  for i in range(num_pages):
786
+ logging.info(f"Processing page {i+1}/{num_pages}")
 
787
 
788
+ # --- Extract text ---
789
  try:
790
+ text_content = reader.pages[i].extract_text()
791
+ except Exception:
 
792
  text_content = ""
793
 
794
+ txs = []
 
 
 
 
 
795
 
796
+ # --- Text strategy ---
797
+ if text_content and len(text_content.strip()) > 50:
798
+ logging.info("Text detected. Using Text Strategy.")
799
+ try:
800
+ result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
801
+ txs = result.get('transactions', [])
802
+ except Exception as text_err:
803
+ # 504 / any failure on text path → fall through to vision
804
+ logging.warning(
805
+ f"Text strategy failed on page {i+1}: {text_err}. "
806
+ f"Falling back to Vision."
807
+ )
808
+ text_content = "" # force vision branch below
809
+
810
+ # --- Vision fallback (low text OR text strategy failure) ---
811
+ if not txs and (not text_content or len(text_content.strip()) <= 50):
812
+ if PDF_IMAGE_SUPPORT:
813
+ logging.info(
814
+ f"Page {i+1}: Using Vision Strategy."
815
+ )
816
+ try:
817
+ txs = process_pdf_page_as_image(model, temp_path, i + 1)
818
+ except Exception as vision_err:
819
+ logging.error(
820
+ f"Vision also failed on page {i+1}: {vision_err}. Skipping page."
821
+ )
822
+ txs = []
823
+ else:
824
+ logging.warning(
825
+ f"Page {i+1}: Low/no text and Vision unavailable. Skipping."
826
+ )
827
 
828
+ all_transactions.extend(txs)
 
 
 
 
829
 
830
  except pypdf.errors.PdfReadError:
831
  logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
832
  if PDF_IMAGE_SUPPORT:
833
  images = convert_from_path(temp_path)
834
+ for img in images:
835
+ result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
836
+ all_transactions.extend(result.get('transactions', []))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
837
  else:
838
  raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
839
 
840
+ return jsonify({'transactions': all_transactions})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
 
842
  except Exception as e:
843
  logging.error(f"Server Error: {e}")
 
846
  if temp_path and os.path.exists(temp_path):
847
  os.remove(temp_path)
848
 
849
+ # -------------------------------------------------------------------------
850
+ # TEXT & IMAGE ENDPOINTS
851
+ # -------------------------------------------------------------------------
852
+
853
  @app.route('/process-text', methods=['POST'])
854
  def process_text():
855
  """Handle raw text input."""
 
859
  return jsonify({'error': 'No text provided'}), 400
860
 
861
  text_input = data['text']
862
+ if not text_input.strip():
863
  return jsonify({'error': 'Text input cannot be empty'}), 400
864
 
 
 
 
865
  model = configure_gemini(api_key)
866
  prompt = get_text_prompt_with_fallback_date()
867
 
868
+ result = call_gemini_with_retry(model, text_input, prompt)
869
+ return jsonify({'transactions': result.get('transactions', [])})
 
 
 
 
 
 
870
 
871
  except Exception as e:
872
  logging.error(f"Error: {e}")
 
891
  file.save(tmp.name)
892
  temp_path = tmp.name
893
 
 
 
 
894
  model = configure_gemini(api_key)
895
  img = Image.open(temp_path)
896
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
897
 
898
+ return jsonify({'transactions': result.get('transactions', [])})
899
 
900
  except Exception as e:
901
  logging.error(f"Error: {e}")
 
929
  if not uploaded_files:
930
  return jsonify({"error": "No files uploaded"}), 400
931
 
 
 
 
932
  template_fields = parse_json_safely(request.form.get("template_fields"), default={})
933
  model = configure_gemini(api_key)
934
 
 
1155
  }
1156
  return jsonify(transaction_types)
1157
 
1158
+ @app.route('/health', methods=['GET'])
1159
+ def health_check():
1160
+ return jsonify({
1161
+ 'status': 'healthy',
1162
+ 'timestamp': datetime.now().isoformat(),
1163
+ 'version': '2.4.0',
1164
+ 'vision_support': PDF_IMAGE_SUPPORT
1165
+ })
1166
 
1167
  if __name__ == '__main__':
1168
  app.run(debug=True, host="0.0.0.0", port=7860)