Spaces:

rairo
/

stmt-api

Running

App Files Files Community

rairo committed on Apr 23

Commit

3a70f45

verified ·

1 Parent(s): 179236b

Update main.py

Browse files

Files changed (1) hide show

main.py +502 -79

main.py CHANGED Viewed

@@ -6,6 +6,7 @@ import tempfile
 import time
 from datetime import datetime
 from io import BytesIO
 # Third-party imports
 from flask import Flask, request, jsonify
@@ -34,11 +35,39 @@ api_key = os.getenv('Gemini')
 if not api_key:
     logging.warning("Gemini API key not found in environment variables.")
 def configure_gemini(api_key):
     """Configure Gemini AI model."""
     try:
         genai.configure(api_key=api_key)
-        return genai.GenerativeModel('gemini-2.5-flash')
     except Exception as e:
         logging.error(f"Error configuring Gemini: {str(e)}")
         raise
@@ -56,6 +85,9 @@ RULES:
    - Do NOT use the current date (today) unless the document explicitly says "Today".
 2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
 3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
 FIELDS TO EXTRACT:
 - Date: string (DD/MM/YYYY)
@@ -136,7 +168,7 @@ def get_text_prompt_with_fallback_date():
     """
     current_date = datetime.now().strftime("%d/%m/%Y")
     return f"""IMPORTANT: Today's date is {current_date}.
-If the text below does not specify a year or date, reasonable assume {current_date} context, but prefer explicit dates in text.
 {FINANCIAL_DOC_PROMPT}
 """
@@ -149,9 +181,9 @@ def categorize_transaction(transaction):
     """
     Categorizes a transaction based strictly on its Type field.
     """
-    tx_type = transaction.get('Type', '').lower()
-    description = transaction.get('Description', '').lower()
-    destination = transaction.get('Destination_of_funds', '').lower()
     account_category = "Uncategorized"
@@ -264,9 +296,12 @@ def categorize_transaction(transaction):
 # HELPER FUNCTIONS
 # -------------------------------------------------------------------------
 def extract_json_from_response(response_text):
     """Extract valid JSON from Gemini's response, handling Markdown fences."""
-    cleaned_text = re.sub(r'```json\s*', '', response_text)
     cleaned_text = re.sub(r'```\s*', '', cleaned_text)
     match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
@@ -292,51 +327,110 @@ Broken JSON: {broken_json_string}"""
         logging.error(f"JSON repair failed: {e}")
         return {fallback_key: []}
-def call_gemini_with_retry(model, content, prompt, retries=2):
     """
     Generic runner for financial Gemini extraction.
     """
     for attempt in range(retries + 1):
         try:
-            response = model.generate_content([prompt, content])
             try:
                 result = extract_json_from_response(response.text)
-                if 'transactions' in result:
-                    result['transactions'] = [
-                        categorize_transaction(tx) for tx in result['transactions']
-                    ]
-                return result
             except ValueError as ve:
                 broken_json = str(ve)
                 repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
-                if 'transactions' in repaired:
-                    repaired['transactions'] = [
-                        categorize_transaction(tx) for tx in repaired['transactions']
-                    ]
-                return repaired
-        except Exception as e:
-            if "429" in str(e) or "ResourceExhausted" in str(e):
-                time.sleep(2 * (attempt + 1))
-                continue
-            logging.error(f"Gemini Error: {e}")
-            if attempt == retries:
-                raise
     return {"transactions": []}
-def call_gemini_students_with_retry(model, content, prompt, retries=2):
     """
     Generic runner for student Gemini extraction.
     """
     for attempt in range(retries + 1):
         try:
-            response = model.generate_content([prompt, content])
             try:
                 result = extract_json_from_response(response.text)
@@ -351,12 +445,25 @@ def call_gemini_students_with_retry(model, content, prompt, retries=2):
                 return repaired
         except Exception as e:
-            if "429" in str(e) or "ResourceExhausted" in str(e):
-                time.sleep(2 * (attempt + 1))
-                continue
-            logging.error(f"Gemini Student Import Error: {e}")
-            if attempt == retries:
-                raise
     return {"students": []}
@@ -410,6 +517,140 @@ def ensure_extra_fields_list(value):
         return cleaned
     return []
 def build_student_prompt(template_fields=None):
     template_fields = template_fields or {}
@@ -463,8 +704,10 @@ def normalize_student_record(student, template_fields=None, sequence=None):
                 mapped[canonical] = str(normalized_raw[alias]).strip()
                 break
     for key, value in normalized_raw.items():
-        if key in {a for aliases in alias_map.values() for a in aliases}:
             continue
         if key == "extra_fields":
             continue
@@ -648,10 +891,12 @@ def parse_students_from_pdf(model, pdf_path, template_fields=None):
             except Exception:
                 text_content = ""
-            if text_content and len(text_content.strip()) > 50:
-                result = call_gemini_students_with_retry(model, text_content, prompt)
-                page_students = result.get('students', []) or []
-                all_students.extend(page_students)
             else:
                 if PDF_IMAGE_SUPPORT:
                     page_students = process_student_pdf_page_as_image(
@@ -703,7 +948,7 @@ def read_spreadsheet_students(file_path, filename, template_fields=None):
     return parse_students_from_dataframe(df, template_fields=template_fields)
 # -------------------------------------------------------------------------
-# CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
 # -------------------------------------------------------------------------
 def process_pdf_page_as_image(model, pdf_path, page_num):
@@ -718,18 +963,139 @@ def process_pdf_page_as_image(model, pdf_path, page_num):
     result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
     return result.get('transactions', [])
 @app.route('/process-pdf', methods=['POST'])
 def process_pdf():
     """
     Smart PDF Processor:
     1. Checks if empty.
-    2. Tries standard Text extraction.
-    3. If Text fails or is empty, falls back to Vision.
     """
     temp_path = None
     try:
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         if file.filename == '':
             return jsonify({'error': 'No file selected'}), 400
@@ -741,48 +1107,99 @@ def process_pdf():
         if is_file_empty(temp_path):
             return jsonify({'error': 'Uploaded file is empty'}), 400
         model = configure_gemini(api_key)
         all_transactions = []
         try:
             reader = pypdf.PdfReader(temp_path)
             num_pages = len(reader.pages)
             for i in range(num_pages):
-                logging.info(f"Processing page {i+1}/{num_pages}")
                 try:
-                    text_content = reader.pages[i].extract_text()
-                except Exception:
                     text_content = ""
-                if text_content and len(text_content.strip()) > 50:
-                    logging.info("Text detected. Using Text Strategy.")
-                    result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
-                else:
-                    logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
-                    if PDF_IMAGE_SUPPORT:
-                        txs = process_pdf_page_as_image(model, temp_path, i + 1)
-                        all_transactions.extend(txs)
-                        continue
-                    else:
-                        logging.warning("Cannot process scanned PDF - pdf2image missing.")
-                        result = {"transactions": []}
-                txs = result.get('transactions', [])
                 all_transactions.extend(txs)
         except pypdf.errors.PdfReadError:
             logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
             if PDF_IMAGE_SUPPORT:
                 images = convert_from_path(temp_path)
-                for img in images:
-                    result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
-                    all_transactions.extend(result.get('transactions', []))
             else:
                 raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
-        return jsonify({'transactions': all_transactions})
     except Exception as e:
         logging.error(f"Server Error: {e}")
@@ -791,10 +1208,6 @@ def process_pdf():
         if temp_path and os.path.exists(temp_path):
             os.remove(temp_path)
-# -------------------------------------------------------------------------
-# TEXT & IMAGE ENDPOINTS
-# -------------------------------------------------------------------------
 @app.route('/process-text', methods=['POST'])
 def process_text():
     """Handle raw text input."""
@@ -804,14 +1217,23 @@ def process_text():
             return jsonify({'error': 'No text provided'}), 400
         text_input = data['text']
-        if not text_input.strip():
             return jsonify({'error': 'Text input cannot be empty'}), 400
         model = configure_gemini(api_key)
         prompt = get_text_prompt_with_fallback_date()
-        result = call_gemini_with_retry(model, text_input, prompt)
-        return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
@@ -836,11 +1258,14 @@ def process_image():
             file.save(tmp.name)
             temp_path = tmp.name
         model = configure_gemini(api_key)
         img = Image.open(temp_path)
         result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
-        return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
@@ -874,6 +1299,9 @@ def parse_students_images():
         if not uploaded_files:
             return jsonify({"error": "No files uploaded"}), 400
         template_fields = parse_json_safely(request.form.get("template_fields"), default={})
         model = configure_gemini(api_key)
@@ -1100,14 +1528,9 @@ def get_transaction_types():
     }
     return jsonify(transaction_types)
-@app.route('/health', methods=['GET'])
-def health_check():
-    return jsonify({
-        'status': 'healthy',
-        'timestamp': datetime.now().isoformat(),
-        'version': '2.3.0',
-        'vision_support': PDF_IMAGE_SUPPORT
-    })
 if __name__ == '__main__':
     app.run(debug=True, host="0.0.0.0", port=7860)

 import time
 from datetime import datetime
 from io import BytesIO
+from typing import List, Dict, Any, Tuple
 # Third-party imports
 from flask import Flask, request, jsonify
 if not api_key:
     logging.warning("Gemini API key not found in environment variables.")
+# -------------------------------------------------------------------------
+# CONSTANTS
+# -------------------------------------------------------------------------
# Runtime tunables, each overridable through an environment variable.
# A non-numeric override still raises ValueError at import time (fail fast);
# numeric values are clamped so an out-of-range override cannot silently
# disable the pipeline.

# Gemini model identifier passed to genai.GenerativeModel.
MODEL_NAME = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
# Upper bounds for one text chunk sent to Gemini (at least 1 of each).
MAX_TEXT_CHUNK_CHARS = max(1, int(os.getenv("MAX_TEXT_CHUNK_CHARS", "12000")))
MAX_TEXT_CHUNK_LINES = max(1, int(os.getenv("MAX_TEXT_CHUNK_LINES", "120")))
# Number of retries after the first attempt. A negative value would make
# range(retries + 1) empty, so the model would never be called.
GEMINI_RETRIES = max(0, int(os.getenv("GEMINI_RETRIES", "3")))
# Base of the linear back-off; time.sleep rejects negative durations.
RETRY_BASE_SLEEP_SECONDS = max(0.0, float(os.getenv("RETRY_BASE_SLEEP_SECONDS", "2")))
# Minimum stripped length for extracted page text to count as usable.
TEXT_MIN_MEANINGFUL_LENGTH = max(0, int(os.getenv("TEXT_MIN_MEANINGFUL_LENGTH", "80")))
+# -------------------------------------------------------------------------
+# CUSTOM EXCEPTIONS
+# -------------------------------------------------------------------------
class GeminiTransientError(Exception):
    """A retryable Gemini failure that persisted after every retry attempt."""


class GeminiTimeoutError(GeminiTransientError):
    """A transient failure identified as a timeout (e.g. HTTP 504).

    Kept as a distinct subclass so callers can react to timeouts
    specifically, for example by falling back to the vision strategy.
    """


class GeminiFatalError(Exception):
    """A Gemini failure that is not considered retryable."""
+# -------------------------------------------------------------------------
+# GEMINI SETUP
+# -------------------------------------------------------------------------
 def configure_gemini(api_key):
     """Configure Gemini AI model."""
     try:
         genai.configure(api_key=api_key)
+        return genai.GenerativeModel(MODEL_NAME)
     except Exception as e:
         logging.error(f"Error configuring Gemini: {str(e)}")
         raise
    - Do NOT use the current date (today) unless the document explicitly says "Today".
 2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
 3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
+4. For bank statements, focus on transaction rows only.
+5. Do not duplicate the same transaction.
+6. If a chunk appears to contain partial rows only, extract only rows that are sufficiently complete.
 FIELDS TO EXTRACT:
 - Date: string (DD/MM/YYYY)
     """
     current_date = datetime.now().strftime("%d/%m/%Y")
     return f"""IMPORTANT: Today's date is {current_date}.
+If the text below does not specify a year or date, reasonably assume {current_date} context, but prefer explicit dates in text.
 {FINANCIAL_DOC_PROMPT}
 """
     """
     Categorizes a transaction based strictly on its Type field.
     """
+    tx_type = str(transaction.get('Type', '')).lower()
+    description = str(transaction.get('Description', '')).lower()
+    destination = str(transaction.get('Destination_of_funds', '')).lower()
     account_category = "Uncategorized"
 # HELPER FUNCTIONS
 # -------------------------------------------------------------------------
def normalize_whitespace(text: str) -> str:
    """Collapse runs of spaces/tabs to one space and trim both ends.

    Falsy input (``None`` or ``""``) yields ``""``. Interior newlines are
    kept; only spaces and tabs are collapsed.
    """
    source = text if text else ""
    collapsed = re.sub(r"[ \t]+", " ", source)
    return collapsed.strip()
 def extract_json_from_response(response_text):
     """Extract valid JSON from Gemini's response, handling Markdown fences."""
+    cleaned_text = re.sub(r'```json\s*', '', response_text or "", flags=re.IGNORECASE)
     cleaned_text = re.sub(r'```\s*', '', cleaned_text)
     match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
         logging.error(f"JSON repair failed: {e}")
         return {fallback_key: []}
def is_timeout_error(exc: Exception) -> bool:
    """Return True when the exception message looks like a timeout.

    Classification is based purely on the lower-cased ``str(exc)``:
    an HTTP 504 status code or one of several timeout/deadline phrases.
    """
    msg = str(exc).lower()
    # Match 504 only as a standalone number so that unrelated figures that
    # merely contain those digits (e.g. "15040 bytes") are not misclassified.
    if re.search(r"(?<!\d)504(?!\d)", msg):
        return True
    timeout_markers = (
        "timed out",
        "timeout",
        "deadlineexceeded",
        "deadline exceeded",
        # gRPC status-code spelling (DEADLINE_EXCEEDED, lower-cased).
        "deadline_exceeded",
        "gateway timeout",
        "upstream request timeout",
    )
    return any(marker in msg for marker in timeout_markers)
def is_retryable_error(exc: Exception) -> bool:
    """Return True when the exception looks transient and worth retrying.

    Classification is based on the lower-cased ``str(exc)``: rate-limit and
    server-side status codes, a set of availability phrases, or anything
    that ``is_timeout_error`` recognises as a timeout.
    """
    msg = str(exc).lower()
    # Status codes must stand alone; a bare substring test would also fire on
    # unrelated numbers such as "15000 characters".
    if re.search(r"(?<!\d)(?:429|500|503)(?!\d)", msg):
        return True
    retryable_markers = (
        "resourceexhausted",
        # gRPC status-code spelling (RESOURCE_EXHAUSTED, lower-cased).
        "resource_exhausted",
        "unavailable",
        "internal",
        "connection reset",
        "temporarily unavailable",
        "service unavailable",
        "rate limit",
    )
    if any(marker in msg for marker in retryable_markers):
        return True
    return is_timeout_error(exc)
def sleep_for_retry(attempt: int):
    """Pause before the next retry using a linearly growing back-off.

    Sleeps for ``RETRY_BASE_SLEEP_SECONDS * (attempt + 1)`` seconds, so the
    wait grows by one base interval with each successive attempt number.
    """
    time.sleep((attempt + 1) * RETRY_BASE_SLEEP_SECONDS)
def post_process_financial_result(result: Dict[str, Any]) -> Dict[str, Any]:
    """Categorize every extracted transaction and guarantee the result shape.

    Args:
        result: Parsed model output; expected to be a dict that may carry a
            ``transactions`` list.

    Returns:
        A dict whose ``transactions`` key is always a list. A dict input is
        updated in place and returned; any other input yields a fresh
        ``{"transactions": []}``.
    """
    if not isinstance(result, dict):
        # Parsed/repaired JSON is not guaranteed to be an object; degrade to an
        # empty result instead of raising on item assignment below.
        logging.warning(
            f"Unexpected Gemini result type {type(result).__name__}; returning no transactions."
        )
        return {"transactions": []}

    transactions = result.get("transactions")
    if isinstance(transactions, list):
        # categorize_transaction reads fields with .get(), so skip anything
        # that is not a mapping rather than failing the whole batch.
        result["transactions"] = [
            categorize_transaction(tx) for tx in transactions if isinstance(tx, dict)
        ]
    else:
        result["transactions"] = []
    return result
def call_gemini_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
    """
    Generic runner for financial Gemini extraction.

    Sends ``[prompt, content]`` to ``model`` requesting JSON output, parses
    the reply (asking Gemini to repair it when parsing raises ValueError) and
    post-processes the transactions.

    Retries transient failures including 504s, sleeping between attempts.

    Raises:
        GeminiTimeoutError: a timeout-like error persisted after all retries,
            so the caller can fall back to vision.
        GeminiTransientError: another retryable error persisted.
        GeminiFatalError: the error is not considered retryable.

    Returns ``{"transactions": []}`` only when ``retries`` is negative and no
    attempt is made at all.
    """
    for attempt in range(retries + 1):
        try:
            response = model.generate_content(
                [prompt, content],
                generation_config={
                    "temperature": 0,
                    "response_mime_type": "application/json",
                }
            )
            # Read the text outside the repair handler below.
            # NOTE(review): the SDK's `response.text` accessor is understood to
            # raise ValueError for blocked/empty responses; such a message must
            # not be forwarded to the repair prompt as "broken JSON" - confirm.
            response_text = response.text
            try:
                result = extract_json_from_response(response_text)
            except ValueError as ve:
                # The ValueError message is what gets handed to the repair prompt.
                broken_json = str(ve)
                result = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
            return post_process_financial_result(result)
        except Exception as e:
            if not is_retryable_error(e):
                logging.error(f"Gemini fatal error: {e}")
                raise GeminiFatalError(str(e)) from e
            logging.warning(
                f"Gemini transient error on attempt {attempt + 1}/{retries + 1}: {e}"
            )
            if attempt < retries:
                sleep_for_retry(attempt)
                continue
            # Retries exhausted: surface the most specific error type.
            if is_timeout_error(e):
                raise GeminiTimeoutError(str(e)) from e
            raise GeminiTransientError(str(e)) from e
    return {"transactions": []}
+def call_gemini_students_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
     """
     Generic runner for student Gemini extraction.
     """
+    last_exc = None
     for attempt in range(retries + 1):
         try:
+            response = model.generate_content(
+                [prompt, content],
+                generation_config={
+                    "temperature": 0,
+                    "response_mime_type": "application/json",
+                }
+            )
             try:
                 result = extract_json_from_response(response.text)
                 return repaired
         except Exception as e:
+            last_exc = e
+            if is_retryable_error(e):
+                logging.warning(
+                    f"Gemini student transient error on attempt {attempt + 1}/{retries + 1}: {e}"
+                )
+                if attempt < retries:
+                    sleep_for_retry(attempt)
+                    continue
+                if is_timeout_error(e):
+                    raise GeminiTimeoutError(str(e))
+                raise GeminiTransientError(str(e))
+            logging.error(f"Gemini student import fatal error: {e}")
+            raise GeminiFatalError(str(e))
+    if last_exc:
+        if is_timeout_error(last_exc):
+            raise GeminiTimeoutError(str(last_exc))
+        raise GeminiTransientError(str(last_exc))
     return {"students": []}
         return cleaned
     return []
+# -------------------------------------------------------------------------
+# PDF / TEXT QUALITY HELPERS
+# -------------------------------------------------------------------------
def text_quality_metrics(text: str, min_length=None) -> Dict[str, Any]:
    """Measure how usable extracted page text is for text-based parsing.

    Args:
        text: Raw text extracted from a page; ``None`` is treated as empty.
        min_length: Minimum stripped length required for ``looks_usable``.
            Defaults to ``TEXT_MIN_MEANINGFUL_LENGTH`` when omitted.

    Returns:
        A dict with ``length`` (stripped characters), ``lines`` (non-blank
        lines), ``alpha_ratio`` / ``digit_ratio`` / ``weird_ratio`` (share of
        characters, rounded to 4 places), ``date_hits``, ``amount_hits`` and
        the overall ``looks_usable`` verdict.
    """
    stripped = (text or "").strip()
    if not stripped:
        return {
            "length": 0,
            "lines": 0,
            "alpha_ratio": 0,
            "digit_ratio": 0,
            "weird_ratio": 1,
            "date_hits": 0,
            "amount_hits": 0,
            "looks_usable": False,
        }
    if min_length is None:
        min_length = TEXT_MIN_MEANINGFUL_LENGTH

    length = len(stripped)
    lines = [line.strip() for line in stripped.splitlines() if line.strip()]
    joined = "\n".join(lines)

    alpha_count = sum(1 for c in joined if c.isalpha())
    digit_count = sum(1 for c in joined if c.isdigit())
    printable_count = sum(1 for c in joined if c.isprintable())
    # "Weird" = printable characters outside letters, digits, whitespace and
    # the punctuation commonly found on statements.
    weird_count = sum(
        1 for c in joined
        if c.isprintable() and not (c.isalnum() or c.isspace() or c in ".,:/()-_&+'*#")
    )

    total_chars = max(len(joined), 1)
    alpha_ratio = alpha_count / total_chars
    digit_ratio = digit_count / total_chars
    weird_ratio = weird_count / total_chars

    date_hits = len(re.findall(r'\b\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?\b', joined))
    # Two-decimal amounts: either grouped thousands ("1,234.56", "1 234.56")
    # or a plain digit run ("1234.56", "12.50"). A pattern limited to a leading
    # \d{1,3} group can never match ungrouped amounts of 1000 or more.
    amount_hits = len(re.findall(r'\b(?:\d{1,3}(?:[,\s]\d{3})+|\d+)\.\d{2}\b', joined))

    looks_usable = (
        length >= min_length and
        weird_ratio < 0.20 and
        printable_count > 0 and
        (alpha_ratio > 0.15 or date_hits > 1 or amount_hits > 2)
    )
    return {
        "length": length,
        "lines": len(lines),
        "alpha_ratio": round(alpha_ratio, 4),
        "digit_ratio": round(digit_ratio, 4),
        "weird_ratio": round(weird_ratio, 4),
        "date_hits": date_hits,
        "amount_hits": amount_hits,
        "looks_usable": looks_usable,
    }
def should_use_text_strategy(text: str) -> bool:
    """Tell whether ``text`` is good enough to parse as text.

    Returns the ``looks_usable`` verdict computed by
    ``text_quality_metrics`` for the given text.
    """
    return text_quality_metrics(text)["looks_usable"]
def split_text_into_chunks(text: str, max_chars: int = MAX_TEXT_CHUNK_CHARS, max_lines: int = MAX_TEXT_CHUNK_LINES) -> List[str]:
    """
    Chunk large extracted page text to reduce timeout risk.
    Keeps line grouping to preserve statement row structure.

    Blank lines are dropped and trailing whitespace is removed from each
    line. Lines are packed greedily: a new chunk starts once adding the next
    line would exceed ``max_chars`` (each line counted with one newline) or
    the chunk already holds ``max_lines`` lines.

    A single line longer than ``max_chars`` is cut into ``max_chars``-sized
    pieces; otherwise text extracted without line breaks would travel as one
    unbounded chunk and defeat the purpose of chunking.

    Returns an empty list when the text has no non-blank lines.
    """
    chunks = []
    current_lines = []
    current_len = 0

    for raw_line in (text or "").splitlines():
        if not raw_line.strip():
            continue
        line = raw_line.rstrip()

        # Hard-split only when the limit is positive, so a degenerate limit
        # cannot produce a zero-step range.
        if max_chars > 0 and len(line) > max_chars:
            pieces = [line[i:i + max_chars] for i in range(0, len(line), max_chars)]
        else:
            pieces = [line]

        for piece in pieces:
            proposed_len = current_len + len(piece) + 1
            if current_lines and (proposed_len > max_chars or len(current_lines) >= max_lines):
                chunks.append("\n".join(current_lines))
                current_lines = [piece]
                current_len = len(piece) + 1
            else:
                current_lines.append(piece)
                current_len = proposed_len

    if current_lines:
        chunks.append("\n".join(current_lines))
    return chunks
def normalize_transaction(tx: Dict[str, Any]) -> Dict[str, Any]:
    """Return a cleaned copy of a transaction with a fixed set of keys.

    Text fields are stringified and stripped; ``Description``,
    ``Customer_name``, ``City`` and ``Destination_of_funds`` additionally have
    runs of spaces/tabs collapsed. A missing or ``None`` value becomes the
    field default (``"N/A"`` for ``Customer_name`` and ``City``, ``""``
    otherwise) instead of the literal string ``"None"``.

    ``Amount`` is always a finite float. Strings using comma thousands
    grouping (``"1,234.56"``) are accepted; anything that cannot be parsed,
    and NaN/infinity, becomes ``0.0``.
    """

    def _text(key, default="", collapse=False):
        # None (e.g. JSON null) is treated like a missing key.
        value = tx.get(key)
        if value is None:
            return default
        cleaned = str(value).strip()
        if collapse:
            cleaned = re.sub(r"[ \t]+", " ", cleaned).strip()
        return cleaned or default

    raw_amount = tx.get("Amount", 0)
    if isinstance(raw_amount, str):
        candidate = raw_amount.strip()
        # Drop commas only for unambiguous thousands grouping, so a
        # decimal-comma value such as "12,50" is not inflated to 1250.
        if re.fullmatch(r"[-+]?\d{1,3}(?:,\d{3})+(?:\.\d+)?", candidate):
            candidate = candidate.replace(",", "")
        raw_amount = candidate
    try:
        amount = float(raw_amount)
    except (TypeError, ValueError, OverflowError):
        amount = 0.0
    # NaN and infinity are not valid JSON numbers; treat them as unparseable.
    if amount != amount or abs(amount) == float("inf"):
        amount = 0.0

    return {
        "Date": _text("Date"),
        "Description": _text("Description", collapse=True),
        "Customer_name": _text("Customer_name", default="N/A", collapse=True),
        "City": _text("City", default="N/A", collapse=True),
        "Amount": amount,
        "Type": _text("Type"),
        "Destination_of_funds": _text("Destination_of_funds", collapse=True),
        "Document_Type": _text("Document_Type"),
        "Account_Category": _text("Account_Category"),
    }
def dedupe_transactions(transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Normalize transactions and drop repeats, keeping first occurrences.

    Two transactions are considered the same when their date, description,
    type and document type match case-insensitively and their amounts are
    equal after rounding to 2 decimals. Entries that are not dicts are
    skipped. The returned items are the normalized copies, in input order.

    NOTE(review): genuinely repeated transactions (same day, description and
    amount) collapse into one under this key - confirm that is acceptable.
    """
    seen = set()
    unique = []
    for tx in transactions:
        if not isinstance(tx, dict):
            # Model output is not guaranteed to be well-formed; one stray
            # entry should not abort the whole batch.
            continue
        item = normalize_transaction(tx)
        key = (
            item.get("Date", "").lower(),
            item.get("Description", "").lower(),
            round(float(item.get("Amount", 0) or 0), 2),
            item.get("Type", "").lower(),
            item.get("Document_Type", "").lower(),
        )
        if key in seen:
            continue
        seen.add(key)
        unique.append(item)
    return unique
+# -------------------------------------------------------------------------
+# STUDENT HELPERS
+# -------------------------------------------------------------------------
 def build_student_prompt(template_fields=None):
     template_fields = template_fields or {}
                 mapped[canonical] = str(normalized_raw[alias]).strip()
                 break
+    alias_flat = {a for aliases in alias_map.values() for a in aliases}
     for key, value in normalized_raw.items():
+        if key in alias_flat:
             continue
         if key == "extra_fields":
             continue
             except Exception:
                 text_content = ""
+            if should_use_text_strategy(text_content):
+                chunks = split_text_into_chunks(text_content)
+                for chunk in chunks:
+                    result = call_gemini_students_with_retry(model, chunk, prompt)
+                    page_students = result.get('students', []) or []
+                    all_students.extend(page_students)
             else:
                 if PDF_IMAGE_SUPPORT:
                     page_students = process_student_pdf_page_as_image(
     return parse_students_from_dataframe(df, template_fields=template_fields)
 # -------------------------------------------------------------------------
+# CORE LOGIC: PDF PROCESSING (TEXT + CHUNKING + VISION FALLBACK)
 # -------------------------------------------------------------------------
 def process_pdf_page_as_image(model, pdf_path, page_num):
     result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
     return result.get('transactions', [])
def process_text_chunks_for_transactions(model, text_content: str) -> List[Dict[str, Any]]:
    """
    Extract transactions from page text by sending it to Gemini in chunks.

    The text is split with ``split_text_into_chunks``; each chunk is run
    through ``call_gemini_with_retry`` with ``FINANCIAL_DOC_PROMPT`` and the
    combined transactions are de-duplicated before being returned.
    Returns an empty list when the text yields no chunks.
    """
    chunks = split_text_into_chunks(text_content)
    if not chunks:
        return []

    total = len(chunks)
    collected = []
    for position, chunk in enumerate(chunks, start=1):
        logging.info(f"Processing text chunk {position}/{total}")
        chunk_result = call_gemini_with_retry(model, chunk, FINANCIAL_DOC_PROMPT)
        collected.extend(chunk_result.get('transactions', []))
    return dedupe_transactions(collected)
def process_pdf_page_with_fallback(model, pdf_path: str, page_num: int, text_content: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Page pipeline:
    1. If text quality is good, try chunked text strategy.
    2. On repeated 504/timeout/transient failure, fall back to vision for that page.
    3. If text quality is poor, go directly to vision.

    Returns:
        ``(transactions, page_summary)``. The summary records the page
        number, text-quality metrics, strategy used, whether a fallback was
        triggered, the number of transactions, a ``status`` of ``processed``
        or ``failed``, and an error message if any. This function does not
        raise: any exception is reported through the summary with an empty
        transaction list.
    """
    # Computed once and reused for both the summary and the strategy decision.
    metrics = text_quality_metrics(text_content)
    page_summary = {
        "page": page_num,
        "text_quality": metrics,
        "strategy_used": None,
        "fallback_triggered": False,
        "transactions_extracted": 0,
        "status": "pending",
        "error": None,
    }

    def _finish(txs):
        # Record a successful extraction in the summary.
        page_summary["transactions_extracted"] = len(txs)
        page_summary["status"] = "processed"
        return txs, page_summary

    def _fail(message):
        # Record a failed page; no transactions are returned.
        page_summary["status"] = "failed"
        page_summary["error"] = message
        return [], page_summary

    def _run_vision():
        # Render the page as an image and extract from that.
        return _finish(dedupe_transactions(process_pdf_page_as_image(model, pdf_path, page_num)))

    try:
        if not metrics["looks_usable"]:
            logging.info(f"Page {page_num}: poor/low text quality. Using vision strategy directly.")
            page_summary["strategy_used"] = "vision_direct"
            if not PDF_IMAGE_SUPPORT:
                return _fail("low-quality text and vision unavailable")
            return _run_vision()

        page_summary["strategy_used"] = "text"
        logging.info(f"Page {page_num}: using chunked text strategy")
        try:
            return _finish(process_text_chunks_for_transactions(model, text_content))
        except (GeminiTimeoutError, GeminiTransientError) as e:
            logging.warning(f"Page {page_num}: text strategy failed, falling back to vision. Error: {e}")
            page_summary["fallback_triggered"] = True
            # Kept in the summary even if the vision fallback then succeeds.
            page_summary["error"] = f"text_strategy_failed: {str(e)}"
            if not PDF_IMAGE_SUPPORT:
                return _fail(f"{page_summary['error']} | vision unavailable")
            page_summary["strategy_used"] = "vision_fallback_after_text_failure"
            return _run_vision()
    except Exception as e:
        logging.error(f"Page {page_num}: final failure - {e}")
        return _fail(str(e))
+# -------------------------------------------------------------------------
+# ROOT / HEALTH
+# -------------------------------------------------------------------------
@app.route('/', methods=['GET'])
def index():
    """Root endpoint: report that the API is up, with version, model and routes."""
    available_endpoints = [
        "/health",
        "/process-pdf",
        "/process-text",
        "/process-image",
        "/transaction-types",
        "/api/customers/parse-students-images",
        "/api/customers/validate-students-import",
        "/api/customers/parse-students-manual"
    ]
    payload = {
        "message": "API is running",
        "version": "2.4.0",
        "model": MODEL_NAME,
        "vision_support": PDF_IMAGE_SUPPORT,
        "endpoints": available_endpoints,
    }
    return jsonify(payload)
@app.route('/health', methods=['GET'])
def health_check():
    """Health probe: return status, current timestamp, version and capabilities."""
    payload = {
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'version': '2.4.0',
        'vision_support': PDF_IMAGE_SUPPORT,
        'model': MODEL_NAME,
    }
    return jsonify(payload)
+# -------------------------------------------------------------------------
+# FINANCIAL ENDPOINTS
+# -------------------------------------------------------------------------
 @app.route('/process-pdf', methods=['POST'])
 def process_pdf():
     """
     Smart PDF Processor:
     1. Checks if empty.
+    2. Tries text extraction per page.
+    3. Uses text-quality gating.
+    4. Chunk-processes large text pages.
+    5. On repeated 504/timeout/transient errors, falls back to Vision for that page.
+    6. Returns partial success instead of failing the whole document for one bad page.
     """
     temp_path = None
     try:
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         if file.filename == '':
             return jsonify({'error': 'No file selected'}), 400
         if is_file_empty(temp_path):
             return jsonify({'error': 'Uploaded file is empty'}), 400
+        if not api_key:
+            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         model = configure_gemini(api_key)
         all_transactions = []
+        pages_summary = []
+        failed_pages = []
         try:
             reader = pypdf.PdfReader(temp_path)
             num_pages = len(reader.pages)
             for i in range(num_pages):
+                page_num = i + 1
+                logging.info(f"Processing page {page_num}/{num_pages}")
                 try:
+                    text_content = reader.pages[i].extract_text() or ""
+                except Exception as e:
+                    logging.warning(f"Page {page_num}: text extraction failed: {e}")
                     text_content = ""
+                txs, page_summary = process_pdf_page_with_fallback(
+                    model=model,
+                    pdf_path=temp_path,
+                    page_num=page_num,
+                    text_content=text_content
+                )
                 all_transactions.extend(txs)
+                pages_summary.append(page_summary)
+                if page_summary["status"] != "processed":
+                    failed_pages.append({
+                        "page": page_num,
+                        "error": page_summary.get("error")
+                    })
         except pypdf.errors.PdfReadError:
             logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
             if PDF_IMAGE_SUPPORT:
                 images = convert_from_path(temp_path)
+                for idx, img in enumerate(images, start=1):
+                    page_summary = {
+                        "page": idx,
+                        "text_quality": None,
+                        "strategy_used": "vision_full_pdf_fallback",
+                        "fallback_triggered": True,
+                        "transactions_extracted": 0,
+                        "status": "pending",
+                        "error": None,
+                    }
+                    try:
+                        result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
+                        txs = dedupe_transactions(result.get('transactions', []))
+                        all_transactions.extend(txs)
+                        page_summary["transactions_extracted"] = len(txs)
+                        page_summary["status"] = "processed"
+                    except Exception as e:
+                        page_summary["status"] = "failed"
+                        page_summary["error"] = str(e)
+                        failed_pages.append({
+                            "page": idx,
+                            "error": str(e)
+                        })
+                    pages_summary.append(page_summary)
             else:
                 raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
+        all_transactions = dedupe_transactions(all_transactions)
+        total_pages = len(pages_summary)
+        processed_pages = len([p for p in pages_summary if p["status"] == "processed"])
+        total_failed_pages = len([p for p in pages_summary if p["status"] != "processed"])
+        response_payload = {
+            'transactions': all_transactions,
+            'summary': {
+                'pages_total': total_pages,
+                'pages_processed': processed_pages,
+                'pages_failed': total_failed_pages,
+                'transactions_total': len(all_transactions),
+                'partial_success': total_failed_pages > 0 and processed_pages > 0,
+                'success': processed_pages > 0
+            },
+            'pages': pages_summary,
+            'failed_pages': failed_pages
+        }
+        if processed_pages == 0:
+            return jsonify(response_payload), 500
+        return jsonify(response_payload), 200
     except Exception as e:
         logging.error(f"Server Error: {e}")
         if temp_path and os.path.exists(temp_path):
             os.remove(temp_path)
 @app.route('/process-text', methods=['POST'])
 def process_text():
     """Handle raw text input."""
             return jsonify({'error': 'No text provided'}), 400
         text_input = data['text']
+        if not str(text_input).strip():
             return jsonify({'error': 'Text input cannot be empty'}), 400
+        if not api_key:
+            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         model = configure_gemini(api_key)
         prompt = get_text_prompt_with_fallback_date()
+        chunks = split_text_into_chunks(text_input)
+        all_transactions = []
+        for chunk in chunks or [text_input]:
+            result = call_gemini_with_retry(model, chunk, prompt)
+            all_transactions.extend(result.get('transactions', []))
+        return jsonify({'transactions': dedupe_transactions(all_transactions)})
     except Exception as e:
         logging.error(f"Error: {e}")
             file.save(tmp.name)
             temp_path = tmp.name
+        if not api_key:
+            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         model = configure_gemini(api_key)
         img = Image.open(temp_path)
         result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
+        return jsonify({'transactions': dedupe_transactions(result.get('transactions', []))})
     except Exception as e:
         logging.error(f"Error: {e}")
         if not uploaded_files:
             return jsonify({"error": "No files uploaded"}), 400
+        if not api_key:
+            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         template_fields = parse_json_safely(request.form.get("template_fields"), default={})
         model = configure_gemini(api_key)
     }
     return jsonify(transaction_types)
+# -------------------------------------------------------------------------
+# MAIN
+# -------------------------------------------------------------------------
 if __name__ == '__main__':
     # Script entry point: start the Flask development server on all
     # interfaces, port 7860.
     #
     # Debug mode is opt-in via the FLASK_DEBUG environment variable instead
     # of being hard-coded on. With debug enabled, Werkzeug serves an
     # interactive debugger that can execute arbitrary Python, which must
     # never be reachable on a server bound to 0.0.0.0.
     debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
     app.run(debug=debug_enabled, host="0.0.0.0", port=7860)