rairo committed on
Commit
3a70f45
·
verified ·
1 Parent(s): 179236b

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +502 -79
main.py CHANGED
@@ -6,6 +6,7 @@ import tempfile
6
  import time
7
  from datetime import datetime
8
  from io import BytesIO
 
9
 
10
  # Third-party imports
11
  from flask import Flask, request, jsonify
@@ -34,11 +35,39 @@ api_key = os.getenv('Gemini')
34
  if not api_key:
35
  logging.warning("Gemini API key not found in environment variables.")
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  def configure_gemini(api_key):
38
  """Configure Gemini AI model."""
39
  try:
40
  genai.configure(api_key=api_key)
41
- return genai.GenerativeModel('gemini-2.5-flash')
42
  except Exception as e:
43
  logging.error(f"Error configuring Gemini: {str(e)}")
44
  raise
@@ -56,6 +85,9 @@ RULES:
56
  - Do NOT use the current date (today) unless the document explicitly says "Today".
57
  2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
58
  3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
 
 
 
59
 
60
  FIELDS TO EXTRACT:
61
  - Date: string (DD/MM/YYYY)
@@ -136,7 +168,7 @@ def get_text_prompt_with_fallback_date():
136
  """
137
  current_date = datetime.now().strftime("%d/%m/%Y")
138
  return f"""IMPORTANT: Today's date is {current_date}.
139
- If the text below does not specify a year or date, reasonable assume {current_date} context, but prefer explicit dates in text.
140
 
141
  {FINANCIAL_DOC_PROMPT}
142
  """
@@ -149,9 +181,9 @@ def categorize_transaction(transaction):
149
  """
150
  Categorizes a transaction based strictly on its Type field.
151
  """
152
- tx_type = transaction.get('Type', '').lower()
153
- description = transaction.get('Description', '').lower()
154
- destination = transaction.get('Destination_of_funds', '').lower()
155
 
156
  account_category = "Uncategorized"
157
 
@@ -264,9 +296,12 @@ def categorize_transaction(transaction):
264
  # HELPER FUNCTIONS
265
  # -------------------------------------------------------------------------
266
 
 
 
 
267
  def extract_json_from_response(response_text):
268
  """Extract valid JSON from Gemini's response, handling Markdown fences."""
269
- cleaned_text = re.sub(r'```json\s*', '', response_text)
270
  cleaned_text = re.sub(r'```\s*', '', cleaned_text)
271
 
272
  match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
@@ -292,51 +327,110 @@ Broken JSON: {broken_json_string}"""
292
  logging.error(f"JSON repair failed: {e}")
293
  return {fallback_key: []}
294
 
295
- def call_gemini_with_retry(model, content, prompt, retries=2):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  """
297
  Generic runner for financial Gemini extraction.
 
 
298
  """
 
 
299
  for attempt in range(retries + 1):
300
  try:
301
- response = model.generate_content([prompt, content])
 
 
 
 
 
 
302
 
303
  try:
304
  result = extract_json_from_response(response.text)
305
-
306
- if 'transactions' in result:
307
- result['transactions'] = [
308
- categorize_transaction(tx) for tx in result['transactions']
309
- ]
310
-
311
- return result
312
  except ValueError as ve:
313
  broken_json = str(ve)
314
  repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
 
315
 
316
- if 'transactions' in repaired:
317
- repaired['transactions'] = [
318
- categorize_transaction(tx) for tx in repaired['transactions']
319
- ]
 
 
 
 
 
 
 
 
320
 
321
- return repaired
 
322
 
323
- except Exception as e:
324
- if "429" in str(e) or "ResourceExhausted" in str(e):
325
- time.sleep(2 * (attempt + 1))
326
- continue
327
- logging.error(f"Gemini Error: {e}")
328
- if attempt == retries:
329
- raise
330
 
331
  return {"transactions": []}
332
 
333
- def call_gemini_students_with_retry(model, content, prompt, retries=2):
334
  """
335
  Generic runner for student Gemini extraction.
336
  """
 
 
337
  for attempt in range(retries + 1):
338
  try:
339
- response = model.generate_content([prompt, content])
 
 
 
 
 
 
340
 
341
  try:
342
  result = extract_json_from_response(response.text)
@@ -351,12 +445,25 @@ def call_gemini_students_with_retry(model, content, prompt, retries=2):
351
  return repaired
352
 
353
  except Exception as e:
354
- if "429" in str(e) or "ResourceExhausted" in str(e):
355
- time.sleep(2 * (attempt + 1))
356
- continue
357
- logging.error(f"Gemini Student Import Error: {e}")
358
- if attempt == retries:
359
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
360
 
361
  return {"students": []}
362
 
@@ -410,6 +517,140 @@ def ensure_extra_fields_list(value):
410
  return cleaned
411
  return []
412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
413
  def build_student_prompt(template_fields=None):
414
  template_fields = template_fields or {}
415
 
@@ -463,8 +704,10 @@ def normalize_student_record(student, template_fields=None, sequence=None):
463
  mapped[canonical] = str(normalized_raw[alias]).strip()
464
  break
465
 
 
 
466
  for key, value in normalized_raw.items():
467
- if key in {a for aliases in alias_map.values() for a in aliases}:
468
  continue
469
  if key == "extra_fields":
470
  continue
@@ -648,10 +891,12 @@ def parse_students_from_pdf(model, pdf_path, template_fields=None):
648
  except Exception:
649
  text_content = ""
650
 
651
- if text_content and len(text_content.strip()) > 50:
652
- result = call_gemini_students_with_retry(model, text_content, prompt)
653
- page_students = result.get('students', []) or []
654
- all_students.extend(page_students)
 
 
655
  else:
656
  if PDF_IMAGE_SUPPORT:
657
  page_students = process_student_pdf_page_as_image(
@@ -703,7 +948,7 @@ def read_spreadsheet_students(file_path, filename, template_fields=None):
703
  return parse_students_from_dataframe(df, template_fields=template_fields)
704
 
705
  # -------------------------------------------------------------------------
706
- # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
707
  # -------------------------------------------------------------------------
708
 
709
  def process_pdf_page_as_image(model, pdf_path, page_num):
@@ -718,18 +963,139 @@ def process_pdf_page_as_image(model, pdf_path, page_num):
718
  result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
719
  return result.get('transactions', [])
720
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
721
  @app.route('/process-pdf', methods=['POST'])
722
  def process_pdf():
723
  """
724
  Smart PDF Processor:
725
  1. Checks if empty.
726
- 2. Tries standard Text extraction.
727
- 3. If Text fails or is empty, falls back to Vision.
 
 
 
728
  """
729
  temp_path = None
730
  try:
731
  if 'file' not in request.files:
732
  return jsonify({'error': 'No file uploaded'}), 400
 
733
  file = request.files['file']
734
  if file.filename == '':
735
  return jsonify({'error': 'No file selected'}), 400
@@ -741,48 +1107,99 @@ def process_pdf():
741
  if is_file_empty(temp_path):
742
  return jsonify({'error': 'Uploaded file is empty'}), 400
743
 
 
 
 
744
  model = configure_gemini(api_key)
745
  all_transactions = []
 
 
746
 
747
  try:
748
  reader = pypdf.PdfReader(temp_path)
749
  num_pages = len(reader.pages)
750
 
751
  for i in range(num_pages):
752
- logging.info(f"Processing page {i+1}/{num_pages}")
 
753
 
754
  try:
755
- text_content = reader.pages[i].extract_text()
756
- except Exception:
 
757
  text_content = ""
758
 
759
- if text_content and len(text_content.strip()) > 50:
760
- logging.info("Text detected. Using Text Strategy.")
761
- result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
762
- else:
763
- logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
764
- if PDF_IMAGE_SUPPORT:
765
- txs = process_pdf_page_as_image(model, temp_path, i + 1)
766
- all_transactions.extend(txs)
767
- continue
768
- else:
769
- logging.warning("Cannot process scanned PDF - pdf2image missing.")
770
- result = {"transactions": []}
771
 
772
- txs = result.get('transactions', [])
773
  all_transactions.extend(txs)
 
 
 
 
 
 
 
774
 
775
  except pypdf.errors.PdfReadError:
776
  logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
777
  if PDF_IMAGE_SUPPORT:
778
  images = convert_from_path(temp_path)
779
- for img in images:
780
- result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
781
- all_transactions.extend(result.get('transactions', []))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
782
  else:
783
  raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
784
 
785
- return jsonify({'transactions': all_transactions})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786
 
787
  except Exception as e:
788
  logging.error(f"Server Error: {e}")
@@ -791,10 +1208,6 @@ def process_pdf():
791
  if temp_path and os.path.exists(temp_path):
792
  os.remove(temp_path)
793
 
794
- # -------------------------------------------------------------------------
795
- # TEXT & IMAGE ENDPOINTS
796
- # -------------------------------------------------------------------------
797
-
798
  @app.route('/process-text', methods=['POST'])
799
  def process_text():
800
  """Handle raw text input."""
@@ -804,14 +1217,23 @@ def process_text():
804
  return jsonify({'error': 'No text provided'}), 400
805
 
806
  text_input = data['text']
807
- if not text_input.strip():
808
  return jsonify({'error': 'Text input cannot be empty'}), 400
809
 
 
 
 
810
  model = configure_gemini(api_key)
811
  prompt = get_text_prompt_with_fallback_date()
812
 
813
- result = call_gemini_with_retry(model, text_input, prompt)
814
- return jsonify({'transactions': result.get('transactions', [])})
 
 
 
 
 
 
815
 
816
  except Exception as e:
817
  logging.error(f"Error: {e}")
@@ -836,11 +1258,14 @@ def process_image():
836
  file.save(tmp.name)
837
  temp_path = tmp.name
838
 
 
 
 
839
  model = configure_gemini(api_key)
840
  img = Image.open(temp_path)
841
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
842
 
843
- return jsonify({'transactions': result.get('transactions', [])})
844
 
845
  except Exception as e:
846
  logging.error(f"Error: {e}")
@@ -874,6 +1299,9 @@ def parse_students_images():
874
  if not uploaded_files:
875
  return jsonify({"error": "No files uploaded"}), 400
876
 
 
 
 
877
  template_fields = parse_json_safely(request.form.get("template_fields"), default={})
878
  model = configure_gemini(api_key)
879
 
@@ -1100,14 +1528,9 @@ def get_transaction_types():
1100
  }
1101
  return jsonify(transaction_types)
1102
 
1103
- @app.route('/health', methods=['GET'])
1104
- def health_check():
1105
- return jsonify({
1106
- 'status': 'healthy',
1107
- 'timestamp': datetime.now().isoformat(),
1108
- 'version': '2.3.0',
1109
- 'vision_support': PDF_IMAGE_SUPPORT
1110
- })
1111
 
1112
  if __name__ == '__main__':
1113
  app.run(debug=True, host="0.0.0.0", port=7860)
 
6
  import time
7
  from datetime import datetime
8
  from io import BytesIO
9
+ from typing import List, Dict, Any, Tuple
10
 
11
  # Third-party imports
12
  from flask import Flask, request, jsonify
 
35
  if not api_key:
36
  logging.warning("Gemini API key not found in environment variables.")
37
 
38
+ # -------------------------------------------------------------------------
39
+ # CONSTANTS
40
+ # -------------------------------------------------------------------------
41
+
42
# Runtime tunables. Each one can be overridden through the environment;
# the literal passed as the second argument is the fallback value.

# Gemini model identifier handed to genai.GenerativeModel.
MODEL_NAME = os.environ.get("GEMINI_MODEL", "gemini-2.5-flash")

# Upper bounds used when splitting extracted text into chunks.
MAX_TEXT_CHUNK_CHARS = int(os.environ.get("MAX_TEXT_CHUNK_CHARS", "12000"))
MAX_TEXT_CHUNK_LINES = int(os.environ.get("MAX_TEXT_CHUNK_LINES", "120"))

# Retry policy for Gemini calls: number of retries and base back-off in seconds.
GEMINI_RETRIES = int(os.environ.get("GEMINI_RETRIES", "3"))
RETRY_BASE_SLEEP_SECONDS = float(os.environ.get("RETRY_BASE_SLEEP_SECONDS", "2"))

# Minimum stripped length for extracted text to be considered usable.
TEXT_MIN_MEANINGFUL_LENGTH = int(os.environ.get("TEXT_MIN_MEANINGFUL_LENGTH", "80"))
48
+
49
+ # -------------------------------------------------------------------------
50
+ # CUSTOM EXCEPTIONS
51
+ # -------------------------------------------------------------------------
52
+
53
class GeminiTransientError(Exception):
    """A Gemini call failed with an error the retry helpers classify as retryable."""


class GeminiTimeoutError(GeminiTransientError):
    """A retryable Gemini failure whose message was classified as a timeout."""


class GeminiFatalError(Exception):
    """A Gemini call failed with an error that is not classified as retryable."""
61
+
62
+ # -------------------------------------------------------------------------
63
+ # GEMINI SETUP
64
+ # -------------------------------------------------------------------------
65
+
66
def configure_gemini(api_key):
    """Register the API key with the Gemini SDK and build the model handle.

    Returns the ``genai.GenerativeModel`` created for ``MODEL_NAME``. Any
    exception raised during configuration is logged and re-raised unchanged.
    """
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(MODEL_NAME)
    except Exception as exc:
        logging.error(f"Error configuring Gemini: {exc}")
        raise
    return model
 
85
  - Do NOT use the current date (today) unless the document explicitly says "Today".
86
  2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
87
  3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
88
+ 4. For bank statements, focus on transaction rows only.
89
+ 5. Do not duplicate the same transaction.
90
+ 6. If a chunk appears to contain partial rows only, extract only rows that are sufficiently complete.
91
 
92
  FIELDS TO EXTRACT:
93
  - Date: string (DD/MM/YYYY)
 
168
  """
169
  current_date = datetime.now().strftime("%d/%m/%Y")
170
  return f"""IMPORTANT: Today's date is {current_date}.
171
+ If the text below does not specify a year or date, reasonably assume {current_date} context, but prefer explicit dates in text.
172
 
173
  {FINANCIAL_DOC_PROMPT}
174
  """
 
181
  """
182
  Categorizes a transaction based strictly on its Type field.
183
  """
184
+ tx_type = str(transaction.get('Type', '')).lower()
185
+ description = str(transaction.get('Description', '')).lower()
186
+ destination = str(transaction.get('Destination_of_funds', '')).lower()
187
 
188
  account_category = "Uncategorized"
189
 
 
296
  # HELPER FUNCTIONS
297
  # -------------------------------------------------------------------------
298
 
299
def normalize_whitespace(text: str) -> str:
    """Collapse each run of spaces/tabs into one space and trim both ends.

    Falsy input (``None`` or ``""``) yields an empty string. Newlines inside
    the text are kept; only leading and trailing whitespace is stripped.
    """
    source = text if text else ""
    collapsed = re.sub(r"[ \t]+", " ", source)
    return collapsed.strip()
301
+
302
  def extract_json_from_response(response_text):
303
  """Extract valid JSON from Gemini's response, handling Markdown fences."""
304
+ cleaned_text = re.sub(r'```json\s*', '', response_text or "", flags=re.IGNORECASE)
305
  cleaned_text = re.sub(r'```\s*', '', cleaned_text)
306
 
307
  match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
 
327
  logging.error(f"JSON repair failed: {e}")
328
  return {fallback_key: []}
329
 
330
def is_timeout_error(exc: Exception) -> bool:
    """Report whether the exception's message looks like a timeout.

    The check is a case-insensitive substring search of ``str(exc)`` against
    a fixed list of timeout markers (HTTP 504, deadline exceeded, ...).
    """
    message = str(exc).lower()
    for marker in (
        "504",
        "timed out",
        "timeout",
        "deadlineexceeded",
        "deadline exceeded",
        "gateway timeout",
        "upstream request timeout",
    ):
        if marker in message:
            return True
    return False
342
+
343
def is_retryable_error(exc: Exception) -> bool:
    """Report whether the exception's message looks like a temporary failure.

    Anything ``is_timeout_error`` accepts is retryable. Otherwise the
    lower-cased message is searched for rate-limit, 5xx and connectivity
    markers.
    """
    message = str(exc).lower()
    if is_timeout_error(exc):
        return True

    transient_markers = (
        "429",
        "resourceexhausted",
        "unavailable",
        "503",
        "500",
        "internal",
        "connection reset",
        "temporarily unavailable",
        "service unavailable",
        "rate limit",
    )
    for marker in transient_markers:
        if marker in message:
            return True
    return False
358
+
359
def sleep_for_retry(attempt: int):
    """Block for a linearly growing back-off before the next attempt.

    The pause lasts ``RETRY_BASE_SLEEP_SECONDS * (attempt + 1)`` seconds.
    """
    multiplier = attempt + 1
    time.sleep(RETRY_BASE_SLEEP_SECONDS * multiplier)
362
+
363
def post_process_financial_result(result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Categorize every transaction in a parsed Gemini result.

    Args:
        result: Parsed JSON from the model, expected to be a dict with a
            ``transactions`` list.

    Returns:
        A dict whose ``transactions`` key is always a list. When ``result`` is
        a dict it is updated in place and returned; otherwise a fresh
        ``{"transactions": []}`` is returned.
    """
    # Model output is untrusted: a top-level list/null must not crash the caller.
    if not isinstance(result, dict):
        logging.warning(
            f"Unexpected Gemini result type {type(result).__name__}; returning no transactions."
        )
        return {"transactions": []}

    transactions = result.get("transactions")
    if not isinstance(transactions, list):
        result["transactions"] = []
        return result

    categorized = []
    for tx in transactions:
        # categorize_transaction reads fields with .get(), so only dicts are valid.
        if not isinstance(tx, dict):
            logging.warning(f"Skipping non-object transaction entry: {tx!r}")
            continue
        categorized.append(categorize_transaction(tx))

    result["transactions"] = categorized
    return result
369
+
370
def call_gemini_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
    """
    Generic runner for financial Gemini extraction.

    Sends ``[prompt, content]`` to the model in JSON mode at temperature 0,
    parses the reply and categorizes the transactions. If the reply cannot be
    parsed, a JSON repair call is attempted once for that reply.

    Args:
        model: Object exposing ``generate_content``.
        content: Text or image payload to analyse.
        prompt: Instruction prompt sent before the content.
        retries: Number of additional attempts after the first one.

    Returns:
        A dict with a ``transactions`` list.

    Raises:
        GeminiTimeoutError: Retryable timeout-style failure on the last attempt,
            so the caller can fall back to vision.
        GeminiTransientError: Other retryable failure on the last attempt.
        GeminiFatalError: Any failure not classified as retryable.
    """
    for attempt in range(retries + 1):
        try:
            response = model.generate_content(
                [prompt, content],
                generation_config={
                    "temperature": 0,
                    "response_mime_type": "application/json",
                },
            )

            # Keep the ValueError handler scoped to JSON extraction only, so an
            # error raised while post-processing is not mistaken for broken JSON.
            # NOTE(review): reading response.text may itself raise ValueError in
            # some SDK versions (e.g. blocked responses) - confirm how that
            # should be handled.
            try:
                result = extract_json_from_response(response.text)
            except ValueError as ve:
                broken_json = str(ve)
                repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
                return post_process_financial_result(repaired)

            return post_process_financial_result(result)

        except Exception as e:
            if not is_retryable_error(e):
                logging.error(f"Gemini fatal error: {e}")
                raise GeminiFatalError(str(e)) from e

            logging.warning(
                f"Gemini transient error on attempt {attempt + 1}/{retries + 1}: {e}"
            )
            if attempt < retries:
                sleep_for_retry(attempt)
                continue

            # Out of attempts: chain the original exception so the traceback survives.
            if is_timeout_error(e):
                raise GeminiTimeoutError(str(e)) from e
            raise GeminiTransientError(str(e)) from e

    # Only reachable when ``retries`` is negative and the loop never runs.
    return {"transactions": []}
418
 
419
+ def call_gemini_students_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
420
  """
421
  Generic runner for student Gemini extraction.
422
  """
423
+ last_exc = None
424
+
425
  for attempt in range(retries + 1):
426
  try:
427
+ response = model.generate_content(
428
+ [prompt, content],
429
+ generation_config={
430
+ "temperature": 0,
431
+ "response_mime_type": "application/json",
432
+ }
433
+ )
434
 
435
  try:
436
  result = extract_json_from_response(response.text)
 
445
  return repaired
446
 
447
  except Exception as e:
448
+ last_exc = e
449
+ if is_retryable_error(e):
450
+ logging.warning(
451
+ f"Gemini student transient error on attempt {attempt + 1}/{retries + 1}: {e}"
452
+ )
453
+ if attempt < retries:
454
+ sleep_for_retry(attempt)
455
+ continue
456
+ if is_timeout_error(e):
457
+ raise GeminiTimeoutError(str(e))
458
+ raise GeminiTransientError(str(e))
459
+
460
+ logging.error(f"Gemini student import fatal error: {e}")
461
+ raise GeminiFatalError(str(e))
462
+
463
+ if last_exc:
464
+ if is_timeout_error(last_exc):
465
+ raise GeminiTimeoutError(str(last_exc))
466
+ raise GeminiTransientError(str(last_exc))
467
 
468
  return {"students": []}
469
 
 
517
  return cleaned
518
  return []
519
 
520
+ # -------------------------------------------------------------------------
521
+ # PDF / TEXT QUALITY HELPERS
522
+ # -------------------------------------------------------------------------
523
+
524
def text_quality_metrics(text: str, min_length: "int | None" = None) -> Dict[str, Any]:
    """
    Score extracted page text to decide whether it is usable as text.

    Args:
        text: Raw extracted text; ``None`` is treated as empty.
        min_length: Minimum stripped length required for ``looks_usable``.
            Defaults to the module-level ``TEXT_MIN_MEANINGFUL_LENGTH``.

    Returns:
        A dict with ``length``, ``lines``, ``alpha_ratio``, ``digit_ratio``,
        ``weird_ratio``, ``date_hits``, ``amount_hits`` and the boolean
        ``looks_usable``.
    """
    text = text or ""
    stripped = text.strip()

    if not stripped:
        return {
            "length": 0,
            "lines": 0,
            "alpha_ratio": 0,
            "digit_ratio": 0,
            "weird_ratio": 1,
            "date_hits": 0,
            "amount_hits": 0,
            "looks_usable": False,
        }

    if min_length is None:
        min_length = TEXT_MIN_MEANINGFUL_LENGTH

    length = len(stripped)
    lines = [line.strip() for line in stripped.splitlines() if line.strip()]
    joined = "\n".join(lines)

    alpha_count = sum(1 for c in joined if c.isalpha())
    digit_count = sum(1 for c in joined if c.isdigit())
    printable_count = sum(1 for c in joined if c.isprintable())
    # "Weird" = printable characters outside letters, digits, whitespace and
    # the punctuation listed below.
    weird_count = sum(
        1 for c in joined
        if c.isprintable() and not (c.isalnum() or c.isspace() or c in ".,:/()-_&+'*#")
    )

    total_chars = max(len(joined), 1)
    alpha_ratio = alpha_count / total_chars
    digit_ratio = digit_count / total_chars
    weird_ratio = weird_count / total_chars

    date_hits = len(re.findall(r'\b\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?\b', joined))
    # Accept both grouped ("1,234.56" / "1 234.56") and ungrouped ("1234.56")
    # amounts. The previous pattern only allowed 1-3 leading digits, so an
    # ungrouped amount of 1000 or more was never counted.
    amount_hits = len(re.findall(r'\b(?:\d{1,3}(?:[,\s]\d{3})+|\d+)\.\d{2}\b', joined))

    looks_usable = (
        length >= min_length and
        weird_ratio < 0.20 and
        printable_count > 0 and
        (alpha_ratio > 0.15 or date_hits > 1 or amount_hits > 2)
    )

    return {
        "length": length,
        "lines": len(lines),
        "alpha_ratio": round(alpha_ratio, 4),
        "digit_ratio": round(digit_ratio, 4),
        "weird_ratio": round(weird_ratio, 4),
        "date_hits": date_hits,
        "amount_hits": amount_hits,
        "looks_usable": looks_usable,
    }
577
+
578
def should_use_text_strategy(text: str) -> bool:
    """Return the ``looks_usable`` verdict of ``text_quality_metrics`` for ``text``."""
    return text_quality_metrics(text)["looks_usable"]
581
+
582
def _split_overlong_line(line: str, max_chars: int) -> List[str]:
    """Break one line into pieces of at most ``max_chars`` characters, preferring spaces."""
    pieces = []
    remaining = line
    while len(remaining) > max_chars:
        # Last space at index 1..max_chars keeps the piece within the limit
        # and guarantees progress; fall back to a hard cut when none exists.
        cut = remaining.rfind(" ", 1, max_chars + 1)
        if cut == -1:
            cut = max_chars
        head = remaining[:cut].rstrip()
        if head:
            pieces.append(head)
        remaining = remaining[cut:].lstrip()
    if remaining:
        pieces.append(remaining)
    return pieces


def split_text_into_chunks(text: str, max_chars: int = MAX_TEXT_CHUNK_CHARS, max_lines: int = MAX_TEXT_CHUNK_LINES) -> List[str]:
    """
    Chunk large extracted page text to reduce timeout risk.
    Keeps line grouping to preserve statement row structure.

    Args:
        text: Text to split; ``None`` or blank text yields ``[]``.
        max_chars: Character budget per chunk (each line counts its length
            plus one for the newline).
        max_lines: Maximum number of lines per chunk.

    Returns:
        Chunks of newline-joined, non-blank, right-stripped lines. A single
        line longer than ``max_chars`` is broken into several pieces so no
        chunk exceeds the character budget.
    """
    lines = []
    for raw_line in (text or "").splitlines():
        if not raw_line.strip():
            continue
        line = raw_line.rstrip()
        # Without this, newline-free input would come back as one huge chunk.
        if max_chars > 0 and len(line) > max_chars:
            lines.extend(_split_overlong_line(line, max_chars))
        else:
            lines.append(line)

    if not lines:
        return []

    chunks = []
    current_lines = []
    current_len = 0

    for line in lines:
        proposed_len = current_len + len(line) + 1
        if current_lines and (proposed_len > max_chars or len(current_lines) >= max_lines):
            chunks.append("\n".join(current_lines))
            current_lines = [line]
            current_len = len(line) + 1
        else:
            current_lines.append(line)
            current_len = proposed_len

    if current_lines:
        chunks.append("\n".join(current_lines))

    return chunks
609
+
610
def normalize_transaction(tx: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a cleaned copy of a transaction with a fixed set of keys.

    Text fields are stringified and stripped; ``Description``,
    ``Customer_name``, ``City`` and ``Destination_of_funds`` also have runs of
    spaces/tabs collapsed. ``Customer_name`` and ``City`` fall back to
    ``"N/A"`` when empty. ``Amount`` is coerced to ``float`` and becomes
    ``0.0`` when it cannot be converted.

    A value of ``None`` (JSON ``null``) is treated like a missing key rather
    than being turned into the literal string ``"None"``.
    """
    def text_field(key: str, default: str = "") -> str:
        value = tx.get(key, default)
        if value is None:
            value = default
        return str(value).strip()

    normalized = {
        "Date": text_field("Date"),
        "Description": normalize_whitespace(text_field("Description")),
        "Customer_name": normalize_whitespace(text_field("Customer_name", "N/A") or "N/A"),
        "City": normalize_whitespace(text_field("City", "N/A") or "N/A"),
        "Amount": tx.get("Amount", 0),
        "Type": text_field("Type"),
        "Destination_of_funds": normalize_whitespace(text_field("Destination_of_funds")),
        "Document_Type": text_field("Document_Type"),
        "Account_Category": text_field("Account_Category"),
    }

    try:
        normalized["Amount"] = float(normalized["Amount"])
    except (TypeError, ValueError, OverflowError):
        # Unparseable, null or out-of-range amounts fall back to zero.
        normalized["Amount"] = 0.0

    return normalized
629
+
630
def dedupe_transactions(transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Normalize each transaction and drop repeats, keeping first occurrences.

    Two transactions count as the same when their lower-cased ``Date``,
    ``Description``, ``Type`` and ``Document_Type`` match and their ``Amount``
    values agree after rounding to two decimals. The returned items are the
    normalized copies, in original order.
    """
    seen_keys = set()
    deduped = []

    for raw_tx in transactions:
        clean = normalize_transaction(raw_tx)
        amount = clean.get("Amount", 0) or 0
        fingerprint = (
            clean.get("Date", "").lower(),
            clean.get("Description", "").lower(),
            round(float(amount), 2),
            clean.get("Type", "").lower(),
            clean.get("Document_Type", "").lower(),
        )
        if fingerprint not in seen_keys:
            seen_keys.add(fingerprint)
            deduped.append(clean)

    return deduped
649
+
650
+ # -------------------------------------------------------------------------
651
+ # STUDENT HELPERS
652
+ # -------------------------------------------------------------------------
653
+
654
  def build_student_prompt(template_fields=None):
655
  template_fields = template_fields or {}
656
 
 
704
  mapped[canonical] = str(normalized_raw[alias]).strip()
705
  break
706
 
707
+ alias_flat = {a for aliases in alias_map.values() for a in aliases}
708
+
709
  for key, value in normalized_raw.items():
710
+ if key in alias_flat:
711
  continue
712
  if key == "extra_fields":
713
  continue
 
891
  except Exception:
892
  text_content = ""
893
 
894
+ if should_use_text_strategy(text_content):
895
+ chunks = split_text_into_chunks(text_content)
896
+ for chunk in chunks:
897
+ result = call_gemini_students_with_retry(model, chunk, prompt)
898
+ page_students = result.get('students', []) or []
899
+ all_students.extend(page_students)
900
  else:
901
  if PDF_IMAGE_SUPPORT:
902
  page_students = process_student_pdf_page_as_image(
 
948
  return parse_students_from_dataframe(df, template_fields=template_fields)
949
 
950
  # -------------------------------------------------------------------------
951
+ # CORE LOGIC: PDF PROCESSING (TEXT + CHUNKING + VISION FALLBACK)
952
  # -------------------------------------------------------------------------
953
 
954
  def process_pdf_page_as_image(model, pdf_path, page_num):
 
963
  result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
964
  return result.get('transactions', [])
965
 
966
def process_text_chunks_for_transactions(model, text_content: str) -> List[Dict[str, Any]]:
    """
    Extract transactions from text by sending it to Gemini chunk by chunk.

    The text is split with ``split_text_into_chunks``; each chunk is sent with
    ``FINANCIAL_DOC_PROMPT`` and the combined results are de-duplicated.
    Returns an empty list when the text produces no chunks.
    """
    chunks = split_text_into_chunks(text_content)
    if not chunks:
        return []

    total = len(chunks)
    collected = []
    for position, chunk in enumerate(chunks, start=1):
        logging.info(f"Processing text chunk {position}/{total}")
        chunk_result = call_gemini_with_retry(model, chunk, FINANCIAL_DOC_PROMPT)
        collected.extend(chunk_result.get('transactions', []))

    return dedupe_transactions(collected)
982
+
983
def process_pdf_page_with_fallback(model, pdf_path: str, page_num: int, text_content: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Page pipeline:
    1. If text quality is good, try chunked text strategy.
    2. On repeated 504/timeout/transient failure, fall back to vision for that page.
    3. If text quality is poor, go directly to vision.

    Args:
        model: Gemini model handle passed through to the extraction helpers.
        pdf_path: Path of the PDF on disk (used for the vision strategy).
        page_num: Page number passed to the image helper and reported back.
        text_content: Text already extracted from the page.

    Returns:
        ``(transactions, page_summary)``. This function never raises: any
        failure is recorded in ``page_summary`` (``status`` = ``"failed"``,
        ``error`` set) and an empty transaction list is returned.
    """
    # Computed once; the same metrics drive the strategy choice and the summary.
    metrics = text_quality_metrics(text_content)
    page_summary = {
        "page": page_num,
        "text_quality": metrics,
        "strategy_used": None,
        "fallback_triggered": False,
        "transactions_extracted": 0,
        "status": "pending",
        "error": None,
    }

    try:
        if metrics["looks_usable"]:
            page_summary["strategy_used"] = "text"
            logging.info(f"Page {page_num}: using chunked text strategy")
            try:
                txs = process_text_chunks_for_transactions(model, text_content)
            except (GeminiTimeoutError, GeminiTransientError) as e:
                logging.warning(f"Page {page_num}: text strategy failed, falling back to vision. Error: {e}")
                page_summary["fallback_triggered"] = True
                page_summary["error"] = f"text_strategy_failed: {str(e)}"
            else:
                page_summary["transactions_extracted"] = len(txs)
                page_summary["status"] = "processed"
                return txs, page_summary

            if not PDF_IMAGE_SUPPORT:
                page_summary["status"] = "failed"
                page_summary["error"] = f"{page_summary['error']} | vision unavailable"
                return [], page_summary

            page_summary["strategy_used"] = "vision_fallback_after_text_failure"
        else:
            logging.info(f"Page {page_num}: poor/low text quality. Using vision strategy directly.")
            page_summary["strategy_used"] = "vision_direct"
            if not PDF_IMAGE_SUPPORT:
                page_summary["status"] = "failed"
                page_summary["error"] = "low-quality text and vision unavailable"
                return [], page_summary

        # Shared vision path for both the direct and the post-failure case.
        txs = process_pdf_page_as_image(model, pdf_path, page_num)
        txs = dedupe_transactions(txs)
        page_summary["transactions_extracted"] = len(txs)
        page_summary["status"] = "processed"
        return txs, page_summary

    except Exception as e:
        # Fatal Gemini errors and vision failures end up here; report, don't raise.
        page_summary["status"] = "failed"
        page_summary["error"] = str(e)
        logging.error(f"Page {page_num}: final failure - {e}")
        return [], page_summary
1045
+
1046
+ # -------------------------------------------------------------------------
1047
+ # ROOT / HEALTH
1048
+ # -------------------------------------------------------------------------
1049
+
1050
@app.route('/', methods=['GET'])
def index():
    """Describe the running service: version, model, vision support and routes."""
    available_endpoints = [
        "/health",
        "/process-pdf",
        "/process-text",
        "/process-image",
        "/transaction-types",
        "/api/customers/parse-students-images",
        "/api/customers/validate-students-import",
        "/api/customers/parse-students-manual"
    ]
    payload = {
        "message": "API is running",
        "version": "2.4.0",
        "model": MODEL_NAME,
        "vision_support": PDF_IMAGE_SUPPORT,
        "endpoints": available_endpoints,
    }
    return jsonify(payload)
1068
+
1069
@app.route('/health', methods=['GET'])
def health_check():
    """Return a small status document with the current local timestamp."""
    checked_at = datetime.now().isoformat()
    payload = {
        'status': 'healthy',
        'timestamp': checked_at,
        'version': '2.4.0',
        'vision_support': PDF_IMAGE_SUPPORT,
        'model': MODEL_NAME,
    }
    return jsonify(payload)
1078
+
1079
+ # -------------------------------------------------------------------------
1080
+ # FINANCIAL ENDPOINTS
1081
+ # -------------------------------------------------------------------------
1082
+
1083
  @app.route('/process-pdf', methods=['POST'])
1084
  def process_pdf():
1085
  """
1086
  Smart PDF Processor:
1087
  1. Checks if empty.
1088
+ 2. Tries text extraction per page.
1089
+ 3. Uses text-quality gating.
1090
+ 4. Chunk-processes large text pages.
1091
+ 5. On repeated 504/timeout/transient errors, falls back to Vision for that page.
1092
+ 6. Returns partial success instead of failing the whole document for one bad page.
1093
  """
1094
  temp_path = None
1095
  try:
1096
  if 'file' not in request.files:
1097
  return jsonify({'error': 'No file uploaded'}), 400
1098
+
1099
  file = request.files['file']
1100
  if file.filename == '':
1101
  return jsonify({'error': 'No file selected'}), 400
 
1107
  if is_file_empty(temp_path):
1108
  return jsonify({'error': 'Uploaded file is empty'}), 400
1109
 
1110
+ if not api_key:
1111
+ return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1112
+
1113
  model = configure_gemini(api_key)
1114
  all_transactions = []
1115
+ pages_summary = []
1116
+ failed_pages = []
1117
 
1118
  try:
1119
  reader = pypdf.PdfReader(temp_path)
1120
  num_pages = len(reader.pages)
1121
 
1122
  for i in range(num_pages):
1123
+ page_num = i + 1
1124
+ logging.info(f"Processing page {page_num}/{num_pages}")
1125
 
1126
  try:
1127
+ text_content = reader.pages[i].extract_text() or ""
1128
+ except Exception as e:
1129
+ logging.warning(f"Page {page_num}: text extraction failed: {e}")
1130
  text_content = ""
1131
 
1132
+ txs, page_summary = process_pdf_page_with_fallback(
1133
+ model=model,
1134
+ pdf_path=temp_path,
1135
+ page_num=page_num,
1136
+ text_content=text_content
1137
+ )
 
 
 
 
 
 
1138
 
 
1139
  all_transactions.extend(txs)
1140
+ pages_summary.append(page_summary)
1141
+
1142
+ if page_summary["status"] != "processed":
1143
+ failed_pages.append({
1144
+ "page": page_num,
1145
+ "error": page_summary.get("error")
1146
+ })
1147
 
1148
  except pypdf.errors.PdfReadError:
1149
  logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
1150
  if PDF_IMAGE_SUPPORT:
1151
  images = convert_from_path(temp_path)
1152
+ for idx, img in enumerate(images, start=1):
1153
+ page_summary = {
1154
+ "page": idx,
1155
+ "text_quality": None,
1156
+ "strategy_used": "vision_full_pdf_fallback",
1157
+ "fallback_triggered": True,
1158
+ "transactions_extracted": 0,
1159
+ "status": "pending",
1160
+ "error": None,
1161
+ }
1162
+ try:
1163
+ result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
1164
+ txs = dedupe_transactions(result.get('transactions', []))
1165
+ all_transactions.extend(txs)
1166
+ page_summary["transactions_extracted"] = len(txs)
1167
+ page_summary["status"] = "processed"
1168
+ except Exception as e:
1169
+ page_summary["status"] = "failed"
1170
+ page_summary["error"] = str(e)
1171
+ failed_pages.append({
1172
+ "page": idx,
1173
+ "error": str(e)
1174
+ })
1175
+ pages_summary.append(page_summary)
1176
  else:
1177
  raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
1178
 
1179
+ all_transactions = dedupe_transactions(all_transactions)
1180
+
1181
+ total_pages = len(pages_summary)
1182
+ processed_pages = len([p for p in pages_summary if p["status"] == "processed"])
1183
+ total_failed_pages = len([p for p in pages_summary if p["status"] != "processed"])
1184
+
1185
+ response_payload = {
1186
+ 'transactions': all_transactions,
1187
+ 'summary': {
1188
+ 'pages_total': total_pages,
1189
+ 'pages_processed': processed_pages,
1190
+ 'pages_failed': total_failed_pages,
1191
+ 'transactions_total': len(all_transactions),
1192
+ 'partial_success': total_failed_pages > 0 and processed_pages > 0,
1193
+ 'success': processed_pages > 0
1194
+ },
1195
+ 'pages': pages_summary,
1196
+ 'failed_pages': failed_pages
1197
+ }
1198
+
1199
+ if processed_pages == 0:
1200
+ return jsonify(response_payload), 500
1201
+
1202
+ return jsonify(response_payload), 200
1203
 
1204
  except Exception as e:
1205
  logging.error(f"Server Error: {e}")
 
1208
  if temp_path and os.path.exists(temp_path):
1209
  os.remove(temp_path)
1210
 
 
 
 
 
1211
  @app.route('/process-text', methods=['POST'])
1212
  def process_text():
1213
  """Handle raw text input."""
 
1217
  return jsonify({'error': 'No text provided'}), 400
1218
 
1219
  text_input = data['text']
1220
+ if not str(text_input).strip():
1221
  return jsonify({'error': 'Text input cannot be empty'}), 400
1222
 
1223
+ if not api_key:
1224
+ return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1225
+
1226
  model = configure_gemini(api_key)
1227
  prompt = get_text_prompt_with_fallback_date()
1228
 
1229
+ chunks = split_text_into_chunks(text_input)
1230
+ all_transactions = []
1231
+
1232
+ for chunk in chunks or [text_input]:
1233
+ result = call_gemini_with_retry(model, chunk, prompt)
1234
+ all_transactions.extend(result.get('transactions', []))
1235
+
1236
+ return jsonify({'transactions': dedupe_transactions(all_transactions)})
1237
 
1238
  except Exception as e:
1239
  logging.error(f"Error: {e}")
 
1258
  file.save(tmp.name)
1259
  temp_path = tmp.name
1260
 
1261
+ if not api_key:
1262
+ return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1263
+
1264
  model = configure_gemini(api_key)
1265
  img = Image.open(temp_path)
1266
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
1267
 
1268
+ return jsonify({'transactions': dedupe_transactions(result.get('transactions', []))})
1269
 
1270
  except Exception as e:
1271
  logging.error(f"Error: {e}")
 
1299
  if not uploaded_files:
1300
  return jsonify({"error": "No files uploaded"}), 400
1301
 
1302
+ if not api_key:
1303
+ return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
1304
+
1305
  template_fields = parse_json_safely(request.form.get("template_fields"), default={})
1306
  model = configure_gemini(api_key)
1307
 
 
1528
  }
1529
  return jsonify(transaction_types)
1530
 
1531
+ # -------------------------------------------------------------------------
1532
+ # MAIN
1533
+ # -------------------------------------------------------------------------
 
 
 
 
 
1534
 
1535
  if __name__ == '__main__':
1536
  app.run(debug=True, host="0.0.0.0", port=7860)