Spaces:

rairo
/

stmt-api

Running

App Files Files Community

rairo committed on Apr 23

Commit

9937c2b

verified ·

1 Parent(s): 3a70f45

Update main.py

Browse files

Files changed (1) hide show

main.py +129 -497

main.py CHANGED Viewed

@@ -6,7 +6,6 @@ import tempfile
 import time
 from datetime import datetime
 from io import BytesIO
-from typing import List, Dict, Any, Tuple
 # Third-party imports
 from flask import Flask, request, jsonify
@@ -35,39 +34,11 @@ api_key = os.getenv('Gemini')
 if not api_key:
     logging.warning("Gemini API key not found in environment variables.")
-# -------------------------------------------------------------------------
-# CONSTANTS
-# -------------------------------------------------------------------------
-MODEL_NAME = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
-MAX_TEXT_CHUNK_CHARS = int(os.getenv("MAX_TEXT_CHUNK_CHARS", "12000"))
-MAX_TEXT_CHUNK_LINES = int(os.getenv("MAX_TEXT_CHUNK_LINES", "120"))
-GEMINI_RETRIES = int(os.getenv("GEMINI_RETRIES", "3"))
-RETRY_BASE_SLEEP_SECONDS = float(os.getenv("RETRY_BASE_SLEEP_SECONDS", "2"))
-TEXT_MIN_MEANINGFUL_LENGTH = int(os.getenv("TEXT_MIN_MEANINGFUL_LENGTH", "80"))
-# -------------------------------------------------------------------------
-# CUSTOM EXCEPTIONS
-# -------------------------------------------------------------------------
-class GeminiTransientError(Exception):
-    pass
-class GeminiTimeoutError(GeminiTransientError):
-    pass
-class GeminiFatalError(Exception):
-    pass
-# -------------------------------------------------------------------------
-# GEMINI SETUP
-# -------------------------------------------------------------------------
 def configure_gemini(api_key):
     """Configure Gemini AI model."""
     try:
         genai.configure(api_key=api_key)
-        return genai.GenerativeModel(MODEL_NAME)
     except Exception as e:
         logging.error(f"Error configuring Gemini: {str(e)}")
         raise
@@ -85,9 +56,6 @@ RULES:
    - Do NOT use the current date (today) unless the document explicitly says "Today".
 2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
 3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
-4. For bank statements, focus on transaction rows only.
-5. Do not duplicate the same transaction.
-6. If a chunk appears to contain partial rows only, extract only rows that are sufficiently complete.
 FIELDS TO EXTRACT:
 - Date: string (DD/MM/YYYY)
@@ -168,7 +136,7 @@ def get_text_prompt_with_fallback_date():
     """
     current_date = datetime.now().strftime("%d/%m/%Y")
     return f"""IMPORTANT: Today's date is {current_date}.
-If the text below does not specify a year or date, reasonably assume {current_date} context, but prefer explicit dates in text.
 {FINANCIAL_DOC_PROMPT}
 """
@@ -181,9 +149,9 @@ def categorize_transaction(transaction):
     """
     Categorizes a transaction based strictly on its Type field.
     """
-    tx_type = str(transaction.get('Type', '')).lower()
-    description = str(transaction.get('Description', '')).lower()
-    destination = str(transaction.get('Destination_of_funds', '')).lower()
     account_category = "Uncategorized"
@@ -296,12 +264,9 @@ def categorize_transaction(transaction):
 # HELPER FUNCTIONS
 # -------------------------------------------------------------------------
-def normalize_whitespace(text: str) -> str:
-    return re.sub(r"[ \t]+", " ", text or "").strip()
 def extract_json_from_response(response_text):
     """Extract valid JSON from Gemini's response, handling Markdown fences."""
-    cleaned_text = re.sub(r'```json\s*', '', response_text or "", flags=re.IGNORECASE)
     cleaned_text = re.sub(r'```\s*', '', cleaned_text)
     match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
@@ -327,109 +292,75 @@ Broken JSON: {broken_json_string}"""
         logging.error(f"JSON repair failed: {e}")
         return {fallback_key: []}
-def is_timeout_error(exc: Exception) -> bool:
-    msg = str(exc).lower()
-    timeout_markers = [
-        "504",
-        "timed out",
-        "timeout",
-        "deadlineexceeded",
-        "deadline exceeded",
-        "gateway timeout",
-        "upstream request timeout",
-    ]
-    return any(marker in msg for marker in timeout_markers)
-def is_retryable_error(exc: Exception) -> bool:
-    msg = str(exc).lower()
-    retryable_markers = [
-        "429",
-        "resourceexhausted",
-        "unavailable",
-        "503",
-        "500",
-        "internal",
-        "connection reset",
-        "temporarily unavailable",
-        "service unavailable",
-        "rate limit",
-    ]
-    return is_timeout_error(exc) or any(marker in msg for marker in retryable_markers)
-def sleep_for_retry(attempt: int):
-    delay = RETRY_BASE_SLEEP_SECONDS * (attempt + 1)
-    time.sleep(delay)
-def post_process_financial_result(result: Dict[str, Any]) -> Dict[str, Any]:
-    if 'transactions' in result and isinstance(result['transactions'], list):
-        result['transactions'] = [categorize_transaction(tx) for tx in result['transactions']]
-    else:
-        result['transactions'] = []
-    return result
-def call_gemini_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
     """
-    Generic runner for financial Gemini extraction.
-    Retries transient failures including 504s.
-    Raises GeminiTimeoutError specifically so caller can fall back to vision.
     """
-    last_exc = None
     for attempt in range(retries + 1):
         try:
             response = model.generate_content(
                 [prompt, content],
-                generation_config={
-                    "temperature": 0,
-                    "response_mime_type": "application/json",
-                }
             )
             try:
                 result = extract_json_from_response(response.text)
-                return post_process_financial_result(result)
             except ValueError as ve:
-                broken_json = str(ve)
-                repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
-                return post_process_financial_result(repaired)
         except Exception as e:
-            last_exc = e
-            if is_retryable_error(e):
                 logging.warning(
-                    f"Gemini transient error on attempt {attempt + 1}/{retries + 1}: {e}"
                 )
-                if attempt < retries:
-                    sleep_for_retry(attempt)
-                    continue
-                if is_timeout_error(e):
-                    raise GeminiTimeoutError(str(e))
-                raise GeminiTransientError(str(e))
-            logging.error(f"Gemini fatal error: {e}")
-            raise GeminiFatalError(str(e))
-    if last_exc:
-        if is_timeout_error(last_exc):
-            raise GeminiTimeoutError(str(last_exc))
-        raise GeminiTransientError(str(last_exc))
     return {"transactions": []}
-def call_gemini_students_with_retry(model, content, prompt, retries=GEMINI_RETRIES):
     """
-    Generic runner for student Gemini extraction.
     """
-    last_exc = None
     for attempt in range(retries + 1):
         try:
             response = model.generate_content(
                 [prompt, content],
-                generation_config={
-                    "temperature": 0,
-                    "response_mime_type": "application/json",
-                }
             )
             try:
@@ -437,36 +368,33 @@ def call_gemini_students_with_retry(model, content, prompt, retries=GEMINI_RETRI
                 if 'students' not in result or not isinstance(result.get('students'), list):
                     return {"students": []}
                 return result
             except ValueError as ve:
-                broken_json = str(ve)
-                repaired = repair_json_with_gemini(model, broken_json, fallback_key="students")
                 if 'students' not in repaired or not isinstance(repaired.get('students'), list):
                     return {"students": []}
                 return repaired
         except Exception as e:
-            last_exc = e
-            if is_retryable_error(e):
                 logging.warning(
-                    f"Gemini student transient error on attempt {attempt + 1}/{retries + 1}: {e}"
                 )
-                if attempt < retries:
-                    sleep_for_retry(attempt)
-                    continue
-                if is_timeout_error(e):
-                    raise GeminiTimeoutError(str(e))
-                raise GeminiTransientError(str(e))
-            logging.error(f"Gemini student import fatal error: {e}")
-            raise GeminiFatalError(str(e))
-    if last_exc:
-        if is_timeout_error(last_exc):
-            raise GeminiTimeoutError(str(last_exc))
-        raise GeminiTransientError(str(last_exc))
     return {"students": []}
 def is_file_empty(file_path):
     """Check if file is empty."""
     return os.path.getsize(file_path) == 0
@@ -517,140 +445,6 @@ def ensure_extra_fields_list(value):
         return cleaned
     return []
-# -------------------------------------------------------------------------
-# PDF / TEXT QUALITY HELPERS
-# -------------------------------------------------------------------------
-def text_quality_metrics(text: str) -> Dict[str, Any]:
-    text = text or ""
-    stripped = text.strip()
-    if not stripped:
-        return {
-            "length": 0,
-            "lines": 0,
-            "alpha_ratio": 0,
-            "digit_ratio": 0,
-            "weird_ratio": 1,
-            "date_hits": 0,
-            "amount_hits": 0,
-            "looks_usable": False,
-        }
-    length = len(stripped)
-    lines = [line.strip() for line in stripped.splitlines() if line.strip()]
-    joined = "\n".join(lines)
-    alpha_count = sum(1 for c in joined if c.isalpha())
-    digit_count = sum(1 for c in joined if c.isdigit())
-    printable_count = sum(1 for c in joined if c.isprintable())
-    weird_count = sum(
-        1 for c in joined
-        if c.isprintable() and not (c.isalnum() or c.isspace() or c in ".,:/()-_&+'*#")
-    )
-    total_chars = max(len(joined), 1)
-    alpha_ratio = alpha_count / total_chars
-    digit_ratio = digit_count / total_chars
-    weird_ratio = weird_count / total_chars
-    date_hits = len(re.findall(r'\b\d{1,2}[/-]\d{1,2}(?:[/-]\d{2,4})?\b', joined))
-    amount_hits = len(re.findall(r'\b\d{1,3}(?:[,\s]\d{3})*(?:\.\d{2})\b', joined))
-    looks_usable = (
-        length >= TEXT_MIN_MEANINGFUL_LENGTH and
-        weird_ratio < 0.20 and
-        printable_count > 0 and
-        (alpha_ratio > 0.15 or date_hits > 1 or amount_hits > 2)
-    )
-    return {
-        "length": length,
-        "lines": len(lines),
-        "alpha_ratio": round(alpha_ratio, 4),
-        "digit_ratio": round(digit_ratio, 4),
-        "weird_ratio": round(weird_ratio, 4),
-        "date_hits": date_hits,
-        "amount_hits": amount_hits,
-        "looks_usable": looks_usable,
-    }
-def should_use_text_strategy(text: str) -> bool:
-    metrics = text_quality_metrics(text)
-    return metrics["looks_usable"]
-def split_text_into_chunks(text: str, max_chars: int = MAX_TEXT_CHUNK_CHARS, max_lines: int = MAX_TEXT_CHUNK_LINES) -> List[str]:
-    """
-    Chunk large extracted page text to reduce timeout risk.
-    Keeps line grouping to preserve statement row structure.
-    """
-    lines = [line.rstrip() for line in (text or "").splitlines() if line.strip()]
-    if not lines:
-        return []
-    chunks = []
-    current_lines = []
-    current_len = 0
-    for line in lines:
-        proposed_len = current_len + len(line) + 1
-        if current_lines and (proposed_len > max_chars or len(current_lines) >= max_lines):
-            chunks.append("\n".join(current_lines))
-            current_lines = [line]
-            current_len = len(line) + 1
-        else:
-            current_lines.append(line)
-            current_len = proposed_len
-    if current_lines:
-        chunks.append("\n".join(current_lines))
-    return chunks
-def normalize_transaction(tx: Dict[str, Any]) -> Dict[str, Any]:
-    normalized = {
-        "Date": str(tx.get("Date", "")).strip(),
-        "Description": normalize_whitespace(str(tx.get("Description", "")).strip()),
-        "Customer_name": normalize_whitespace(str(tx.get("Customer_name", "N/A")).strip() or "N/A"),
-        "City": normalize_whitespace(str(tx.get("City", "N/A")).strip() or "N/A"),
-        "Amount": tx.get("Amount", 0),
-        "Type": str(tx.get("Type", "")).strip(),
-        "Destination_of_funds": normalize_whitespace(str(tx.get("Destination_of_funds", "")).strip()),
-        "Document_Type": str(tx.get("Document_Type", "")).strip(),
-        "Account_Category": str(tx.get("Account_Category", "")).strip(),
-    }
-    try:
-        normalized["Amount"] = float(normalized["Amount"])
-    except Exception:
-        normalized["Amount"] = 0.0
-    return normalized
-def dedupe_transactions(transactions: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
-    seen = set()
-    unique = []
-    for tx in transactions:
-        item = normalize_transaction(tx)
-        key = (
-            item.get("Date", "").lower(),
-            item.get("Description", "").lower(),
-            round(float(item.get("Amount", 0) or 0), 2),
-            item.get("Type", "").lower(),
-            item.get("Document_Type", "").lower(),
-        )
-        if key in seen:
-            continue
-        seen.add(key)
-        unique.append(item)
-    return unique
-# -------------------------------------------------------------------------
-# STUDENT HELPERS
-# -------------------------------------------------------------------------
 def build_student_prompt(template_fields=None):
     template_fields = template_fields or {}
@@ -674,9 +468,7 @@ PRE-IMPORT CONFIGURATION:
 """
 def normalize_student_record(student, template_fields=None, sequence=None):
-    """
-    Normalizes one parsed student record into the required shape.
-    """
     template_fields = template_fields or {}
     raw = student or {}
@@ -704,10 +496,8 @@ def normalize_student_record(student, template_fields=None, sequence=None):
                 mapped[canonical] = str(normalized_raw[alias]).strip()
                 break
-    alias_flat = {a for aliases in alias_map.values() for a in aliases}
     for key, value in normalized_raw.items():
-        if key in alias_flat:
             continue
         if key == "extra_fields":
             continue
@@ -891,12 +681,10 @@ def parse_students_from_pdf(model, pdf_path, template_fields=None):
             except Exception:
                 text_content = ""
-            if should_use_text_strategy(text_content):
-                chunks = split_text_into_chunks(text_content)
-                for chunk in chunks:
-                    result = call_gemini_students_with_retry(model, chunk, prompt)
-                    page_students = result.get('students', []) or []
-                    all_students.extend(page_students)
             else:
                 if PDF_IMAGE_SUPPORT:
                     page_students = process_student_pdf_page_as_image(
@@ -948,7 +736,7 @@ def read_spreadsheet_students(file_path, filename, template_fields=None):
     return parse_students_from_dataframe(df, template_fields=template_fields)
 # -------------------------------------------------------------------------
-# CORE LOGIC: PDF PROCESSING (TEXT + CHUNKING + VISION FALLBACK)
 # -------------------------------------------------------------------------
 def process_pdf_page_as_image(model, pdf_path, page_num):
@@ -963,139 +751,19 @@ def process_pdf_page_as_image(model, pdf_path, page_num):
     result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
     return result.get('transactions', [])
-def process_text_chunks_for_transactions(model, text_content: str) -> List[Dict[str, Any]]:
-    """
-    Split large text into smaller chunks before sending to Gemini.
-    This lowers timeout risk for dense bank statement pages.
-    """
-    chunks = split_text_into_chunks(text_content)
-    if not chunks:
-        return []
-    all_transactions = []
-    for idx, chunk in enumerate(chunks, start=1):
-        logging.info(f"Processing text chunk {idx}/{len(chunks)}")
-        result = call_gemini_with_retry(model, chunk, FINANCIAL_DOC_PROMPT)
-        all_transactions.extend(result.get('transactions', []))
-    return dedupe_transactions(all_transactions)
-def process_pdf_page_with_fallback(model, pdf_path: str, page_num: int, text_content: str) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
-    """
-    Page pipeline:
-    1. If text quality is good, try chunked text strategy.
-    2. On repeated 504/timeout/transient failure, fall back to vision for that page.
-    3. If text quality is poor, go directly to vision.
-    """
-    metrics = text_quality_metrics(text_content)
-    page_summary = {
-        "page": page_num,
-        "text_quality": metrics,
-        "strategy_used": None,
-        "fallback_triggered": False,
-        "transactions_extracted": 0,
-        "status": "pending",
-        "error": None,
-    }
-    try:
-        if should_use_text_strategy(text_content):
-            page_summary["strategy_used"] = "text"
-            logging.info(f"Page {page_num}: using chunked text strategy")
-            try:
-                txs = process_text_chunks_for_transactions(model, text_content)
-                page_summary["transactions_extracted"] = len(txs)
-                page_summary["status"] = "processed"
-                return txs, page_summary
-            except (GeminiTimeoutError, GeminiTransientError) as e:
-                logging.warning(f"Page {page_num}: text strategy failed, falling back to vision. Error: {e}")
-                page_summary["fallback_triggered"] = True
-                page_summary["error"] = f"text_strategy_failed: {str(e)}"
-                if PDF_IMAGE_SUPPORT:
-                    page_summary["strategy_used"] = "vision_fallback_after_text_failure"
-                    txs = process_pdf_page_as_image(model, pdf_path, page_num)
-                    txs = dedupe_transactions(txs)
-                    page_summary["transactions_extracted"] = len(txs)
-                    page_summary["status"] = "processed"
-                    return txs, page_summary
-                else:
-                    page_summary["status"] = "failed"
-                    page_summary["error"] = f"{page_summary['error']} | vision unavailable"
-                    return [], page_summary
-        else:
-            logging.info(f"Page {page_num}: poor/low text quality. Using vision strategy directly.")
-            page_summary["strategy_used"] = "vision_direct"
-            if PDF_IMAGE_SUPPORT:
-                txs = process_pdf_page_as_image(model, pdf_path, page_num)
-                txs = dedupe_transactions(txs)
-                page_summary["transactions_extracted"] = len(txs)
-                page_summary["status"] = "processed"
-                return txs, page_summary
-            else:
-                page_summary["status"] = "failed"
-                page_summary["error"] = "low-quality text and vision unavailable"
-                return [], page_summary
-    except Exception as e:
-        page_summary["status"] = "failed"
-        page_summary["error"] = str(e)
-        logging.error(f"Page {page_num}: final failure - {e}")
-        return [], page_summary
-# -------------------------------------------------------------------------
-# ROOT / HEALTH
-# -------------------------------------------------------------------------
-@app.route('/', methods=['GET'])
-def index():
-    return jsonify({
-        "message": "API is running",
-        "version": "2.4.0",
-        "model": MODEL_NAME,
-        "vision_support": PDF_IMAGE_SUPPORT,
-        "endpoints": [
-            "/health",
-            "/process-pdf",
-            "/process-text",
-            "/process-image",
-            "/transaction-types",
-            "/api/customers/parse-students-images",
-            "/api/customers/validate-students-import",
-            "/api/customers/parse-students-manual"
-        ]
-    })
-@app.route('/health', methods=['GET'])
-def health_check():
-    return jsonify({
-        'status': 'healthy',
-        'timestamp': datetime.now().isoformat(),
-        'version': '2.4.0',
-        'vision_support': PDF_IMAGE_SUPPORT,
-        'model': MODEL_NAME
-    })
-# -------------------------------------------------------------------------
-# FINANCIAL ENDPOINTS
-# -------------------------------------------------------------------------
 @app.route('/process-pdf', methods=['POST'])
 def process_pdf():
     """
     Smart PDF Processor:
     1. Checks if empty.
-    2. Tries text extraction per page.
-    3. Uses text-quality gating.
-    4. Chunk-processes large text pages.
-    5. On repeated 504/timeout/transient errors, falls back to Vision for that page.
-    6. Returns partial success instead of failing the whole document for one bad page.
     """
     temp_path = None
     try:
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         if file.filename == '':
             return jsonify({'error': 'No file selected'}), 400
@@ -1107,99 +775,69 @@ def process_pdf():
         if is_file_empty(temp_path):
             return jsonify({'error': 'Uploaded file is empty'}), 400
-        if not api_key:
-            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         model = configure_gemini(api_key)
         all_transactions = []
-        pages_summary = []
-        failed_pages = []
         try:
             reader = pypdf.PdfReader(temp_path)
             num_pages = len(reader.pages)
             for i in range(num_pages):
-                page_num = i + 1
-                logging.info(f"Processing page {page_num}/{num_pages}")
                 try:
-                    text_content = reader.pages[i].extract_text() or ""
-                except Exception as e:
-                    logging.warning(f"Page {page_num}: text extraction failed: {e}")
                     text_content = ""
-                txs, page_summary = process_pdf_page_with_fallback(
-                    model=model,
-                    pdf_path=temp_path,
-                    page_num=page_num,
-                    text_content=text_content
-                )
-                all_transactions.extend(txs)
-                pages_summary.append(page_summary)
-                if page_summary["status"] != "processed":
-                    failed_pages.append({
-                        "page": page_num,
-                        "error": page_summary.get("error")
-                    })
         except pypdf.errors.PdfReadError:
             logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
             if PDF_IMAGE_SUPPORT:
                 images = convert_from_path(temp_path)
-                for idx, img in enumerate(images, start=1):
-                    page_summary = {
-                        "page": idx,
-                        "text_quality": None,
-                        "strategy_used": "vision_full_pdf_fallback",
-                        "fallback_triggered": True,
-                        "transactions_extracted": 0,
-                        "status": "pending",
-                        "error": None,
-                    }
-                    try:
-                        result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
-                        txs = dedupe_transactions(result.get('transactions', []))
-                        all_transactions.extend(txs)
-                        page_summary["transactions_extracted"] = len(txs)
-                        page_summary["status"] = "processed"
-                    except Exception as e:
-                        page_summary["status"] = "failed"
-                        page_summary["error"] = str(e)
-                        failed_pages.append({
-                            "page": idx,
-                            "error": str(e)
-                        })
-                    pages_summary.append(page_summary)
             else:
                 raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
-        all_transactions = dedupe_transactions(all_transactions)
-        total_pages = len(pages_summary)
-        processed_pages = len([p for p in pages_summary if p["status"] == "processed"])
-        total_failed_pages = len([p for p in pages_summary if p["status"] != "processed"])
-        response_payload = {
-            'transactions': all_transactions,
-            'summary': {
-                'pages_total': total_pages,
-                'pages_processed': processed_pages,
-                'pages_failed': total_failed_pages,
-                'transactions_total': len(all_transactions),
-                'partial_success': total_failed_pages > 0 and processed_pages > 0,
-                'success': processed_pages > 0
-            },
-            'pages': pages_summary,
-            'failed_pages': failed_pages
-        }
-        if processed_pages == 0:
-            return jsonify(response_payload), 500
-        return jsonify(response_payload), 200
     except Exception as e:
         logging.error(f"Server Error: {e}")
@@ -1208,6 +846,10 @@ def process_pdf():
         if temp_path and os.path.exists(temp_path):
             os.remove(temp_path)
 @app.route('/process-text', methods=['POST'])
 def process_text():
     """Handle raw text input."""
@@ -1217,23 +859,14 @@ def process_text():
             return jsonify({'error': 'No text provided'}), 400
         text_input = data['text']
-        if not str(text_input).strip():
             return jsonify({'error': 'Text input cannot be empty'}), 400
-        if not api_key:
-            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         model = configure_gemini(api_key)
         prompt = get_text_prompt_with_fallback_date()
-        chunks = split_text_into_chunks(text_input)
-        all_transactions = []
-        for chunk in chunks or [text_input]:
-            result = call_gemini_with_retry(model, chunk, prompt)
-            all_transactions.extend(result.get('transactions', []))
-        return jsonify({'transactions': dedupe_transactions(all_transactions)})
     except Exception as e:
         logging.error(f"Error: {e}")
@@ -1258,14 +891,11 @@ def process_image():
             file.save(tmp.name)
             temp_path = tmp.name
-        if not api_key:
-            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         model = configure_gemini(api_key)
         img = Image.open(temp_path)
         result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
-        return jsonify({'transactions': dedupe_transactions(result.get('transactions', []))})
     except Exception as e:
         logging.error(f"Error: {e}")
@@ -1299,9 +929,6 @@ def parse_students_images():
         if not uploaded_files:
             return jsonify({"error": "No files uploaded"}), 400
-        if not api_key:
-            return jsonify({'error': 'Gemini API key is missing from environment variables'}), 500
         template_fields = parse_json_safely(request.form.get("template_fields"), default={})
         model = configure_gemini(api_key)
@@ -1528,9 +1155,14 @@ def get_transaction_types():
     }
     return jsonify(transaction_types)
-# -------------------------------------------------------------------------
-# MAIN
-# -------------------------------------------------------------------------
 if __name__ == '__main__':
     app.run(debug=True, host="0.0.0.0", port=7860)

 import time
 from datetime import datetime
 from io import BytesIO
 # Third-party imports
 from flask import Flask, request, jsonify
 if not api_key:
     logging.warning("Gemini API key not found in environment variables.")
 def configure_gemini(api_key):
     """Configure Gemini AI model."""
     try:
         genai.configure(api_key=api_key)
+        return genai.GenerativeModel('gemini-2.5-flash')
     except Exception as e:
         logging.error(f"Error configuring Gemini: {str(e)}")
         raise
    - Do NOT use the current date (today) unless the document explicitly says "Today".
 2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
 3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
 FIELDS TO EXTRACT:
 - Date: string (DD/MM/YYYY)
     """
     current_date = datetime.now().strftime("%d/%m/%Y")
     return f"""IMPORTANT: Today's date is {current_date}.
+If the text below does not specify a year or date, reasonably assume {current_date} context, but prefer explicit dates in text.
 {FINANCIAL_DOC_PROMPT}
 """
     """
     Categorizes a transaction based strictly on its Type field.
     """
+    tx_type = transaction.get('Type', '').lower()
+    description = transaction.get('Description', '').lower()
+    destination = transaction.get('Destination_of_funds', '').lower()
     account_category = "Uncategorized"
 # HELPER FUNCTIONS
 # -------------------------------------------------------------------------
 def extract_json_from_response(response_text):
     """Extract valid JSON from Gemini's response, handling Markdown fences."""
+    cleaned_text = re.sub(r'```json\s*', '', response_text)
     cleaned_text = re.sub(r'```\s*', '', cleaned_text)
     match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
         logging.error(f"JSON repair failed: {e}")
         return {fallback_key: []}
+# -------------------------------------------------------------------------
+# RETRYABLE ERROR DETECTION
+# -------------------------------------------------------------------------
+RETRYABLE_CODES = ["429", "503", "504", "ResourceExhausted", "DeadlineExceeded", "UNAVAILABLE"]
+def is_retryable(error: Exception) -> bool:
+    err = str(error)
+    return any(code in err for code in RETRYABLE_CODES)
+# -------------------------------------------------------------------------
+# CORE GEMINI CALLERS WITH RETRY + 504 HANDLING
+# -------------------------------------------------------------------------
+def call_gemini_with_retry(model, content, prompt, retries=3):
     """
+    Financial extraction runner.
+    Retries on 429, 503, 504 / DeadlineExceeded with exponential backoff.
     """
     for attempt in range(retries + 1):
         try:
             response = model.generate_content(
                 [prompt, content],
+                request_options={"timeout": 120}
             )
             try:
                 result = extract_json_from_response(response.text)
+                if 'transactions' in result:
+                    result['transactions'] = [
+                        categorize_transaction(tx) for tx in result['transactions']
+                    ]
+                return result
             except ValueError as ve:
+                repaired = repair_json_with_gemini(model, str(ve), fallback_key="transactions")
+                if 'transactions' in repaired:
+                    repaired['transactions'] = [
+                        categorize_transaction(tx) for tx in repaired['transactions']
+                    ]
+                return repaired
         except Exception as e:
+            if is_retryable(e) and attempt < retries:
+                wait = 2 ** attempt  # 1s → 2s → 4s
                 logging.warning(
+                    f"Gemini retryable error (attempt {attempt + 1}/{retries + 1}): {e}. "
+                    f"Retrying in {wait}s..."
                 )
+                time.sleep(wait)
+                continue
+            logging.error(f"Gemini Error: {e}")
+            if attempt == retries:
+                raise
     return {"transactions": []}
+def call_gemini_students_with_retry(model, content, prompt, retries=3):
     """
+    Student extraction runner.
+    Retries on 429, 503, 504 / DeadlineExceeded with exponential backoff.
     """
     for attempt in range(retries + 1):
         try:
             response = model.generate_content(
                 [prompt, content],
+                request_options={"timeout": 120}
             )
             try:
                 if 'students' not in result or not isinstance(result.get('students'), list):
                     return {"students": []}
                 return result
             except ValueError as ve:
+                repaired = repair_json_with_gemini(model, str(ve), fallback_key="students")
                 if 'students' not in repaired or not isinstance(repaired.get('students'), list):
                     return {"students": []}
                 return repaired
         except Exception as e:
+            if is_retryable(e) and attempt < retries:
+                wait = 2 ** attempt
                 logging.warning(
+                    f"Gemini Student retryable error (attempt {attempt + 1}/{retries + 1}): {e}. "
+                    f"Retrying in {wait}s..."
                 )
+                time.sleep(wait)
+                continue
+            logging.error(f"Gemini Student Import Error: {e}")
+            if attempt == retries:
+                raise
     return {"students": []}
+# -------------------------------------------------------------------------
+# UTILITY FUNCTIONS
+# -------------------------------------------------------------------------
 def is_file_empty(file_path):
     """Check if file is empty."""
     return os.path.getsize(file_path) == 0
         return cleaned
     return []
 def build_student_prompt(template_fields=None):
     template_fields = template_fields or {}
 """
 def normalize_student_record(student, template_fields=None, sequence=None):
+    """Normalizes one parsed student record into the required shape."""
     template_fields = template_fields or {}
     raw = student or {}
                 mapped[canonical] = str(normalized_raw[alias]).strip()
                 break
     for key, value in normalized_raw.items():
+        if key in {a for aliases in alias_map.values() for a in aliases}:
             continue
         if key == "extra_fields":
             continue
             except Exception:
                 text_content = ""
+            if text_content and len(text_content.strip()) > 50:
+                result = call_gemini_students_with_retry(model, text_content, prompt)
+                page_students = result.get('students', []) or []
+                all_students.extend(page_students)
             else:
                 if PDF_IMAGE_SUPPORT:
                     page_students = process_student_pdf_page_as_image(
     return parse_students_from_dataframe(df, template_fields=template_fields)
 # -------------------------------------------------------------------------
+# CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
 # -------------------------------------------------------------------------
 def process_pdf_page_as_image(model, pdf_path, page_num):
     result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
     return result.get('transactions', [])
 @app.route('/process-pdf', methods=['POST'])
 def process_pdf():
     """
     Smart PDF Processor:
     1. Checks if empty.
+    2. Tries standard Text extraction per page.
+    3. On low text or any text-strategy failure (e.g. 504/timeout), falls back to Vision per page.
+    4. If Vision also fails, skips page and continues (no full document crash).
     """
     temp_path = None
     try:
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         if file.filename == '':
             return jsonify({'error': 'No file selected'}), 400
         if is_file_empty(temp_path):
             return jsonify({'error': 'Uploaded file is empty'}), 400
         model = configure_gemini(api_key)
         all_transactions = []
         try:
             reader = pypdf.PdfReader(temp_path)
             num_pages = len(reader.pages)
             for i in range(num_pages):
+                logging.info(f"Processing page {i+1}/{num_pages}")
+                # --- Extract text ---
                 try:
+                    text_content = reader.pages[i].extract_text()
+                except Exception:
                     text_content = ""
+                txs = []
+                # --- Text strategy ---
+                if text_content and len(text_content.strip()) > 50:
+                    logging.info("Text detected. Using Text Strategy.")
+                    try:
+                        result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
+                        txs = result.get('transactions', [])
+                    except Exception as text_err:
+                        # 504 / any failure on text path → fall through to vision
+                        logging.warning(
+                            f"Text strategy failed on page {i+1}: {text_err}. "
+                            f"Falling back to Vision."
+                        )
+                        text_content = ""  # force vision branch below
+                # --- Vision fallback (low text OR text strategy failure) ---
+                if not txs and (not text_content or len(text_content.strip()) <= 50):
+                    if PDF_IMAGE_SUPPORT:
+                        logging.info(
+                            f"Page {i+1}: Using Vision Strategy."
+                        )
+                        try:
+                            txs = process_pdf_page_as_image(model, temp_path, i + 1)
+                        except Exception as vision_err:
+                            logging.error(
+                                f"Vision also failed on page {i+1}: {vision_err}. Skipping page."
+                            )
+                            txs = []
+                    else:
+                        logging.warning(
+                            f"Page {i+1}: Low/no text and Vision unavailable. Skipping."
+                        )
+                all_transactions.extend(txs)
         except pypdf.errors.PdfReadError:
             logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
             if PDF_IMAGE_SUPPORT:
                 images = convert_from_path(temp_path)
+                for img in images:
+                    result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
+                    all_transactions.extend(result.get('transactions', []))
             else:
                 raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
+        return jsonify({'transactions': all_transactions})
     except Exception as e:
         logging.error(f"Server Error: {e}")
         if temp_path and os.path.exists(temp_path):
             os.remove(temp_path)
+# -------------------------------------------------------------------------
+# TEXT & IMAGE ENDPOINTS
+# -------------------------------------------------------------------------
 @app.route('/process-text', methods=['POST'])
 def process_text():
     """Handle raw text input."""
             return jsonify({'error': 'No text provided'}), 400
         text_input = data['text']
+        if not text_input.strip():
             return jsonify({'error': 'Text input cannot be empty'}), 400
         model = configure_gemini(api_key)
         prompt = get_text_prompt_with_fallback_date()
+        result = call_gemini_with_retry(model, text_input, prompt)
+        return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
             file.save(tmp.name)
             temp_path = tmp.name
         model = configure_gemini(api_key)
         img = Image.open(temp_path)
         result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
+        return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
         if not uploaded_files:
             return jsonify({"error": "No files uploaded"}), 400
         template_fields = parse_json_safely(request.form.get("template_fields"), default={})
         model = configure_gemini(api_key)
     }
     return jsonify(transaction_types)
+@app.route('/health', methods=['GET'])
+def health_check():
+    return jsonify({
+        'status': 'healthy',
+        'timestamp': datetime.now().isoformat(),
+        'version': '2.4.0',
+        'vision_support': PDF_IMAGE_SUPPORT
+    })
 if __name__ == '__main__':
     app.run(debug=True, host="0.0.0.0", port=7860)