# Source: Hugging Face Space "rairo/stmt-api" (status: Running).
# File size: 23,910 bytes.

import os
import json
import logging
import re
import tempfile
import time
from datetime import datetime
from io import BytesIO

# Third-party imports
from flask import Flask, request, jsonify
from flask_cors import CORS
import pandas as pd
import pypdf
import google.generativeai as genai
from PIL import Image

# Setup logging FIRST. A module-level logging.warning() issued while the root
# logger has no handlers implicitly installs a default handler, after which a
# later basicConfig() call is silently ignored (losing the INFO level and the
# custom format). Configuring before the optional import avoids that.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Optional dependency: pdf2image is only needed for the image (vision)
# fallback. PDF_IMAGE_SUPPORT records whether the import succeeded.
try:
    from pdf2image import convert_from_path
    PDF_IMAGE_SUPPORT = True
except ImportError:
    PDF_IMAGE_SUPPORT = False
    logging.warning("pdf2image not installed. Scanned/Encrypted PDF fallback will not work.")

# Flask application object; CORS is enabled for the whole app.
app = Flask(__name__)
CORS(app)

# The Gemini API key is read from the 'Gemini' environment variable.
# A missing or empty value is only reported here; no alternative key is set.
api_key = os.environ.get('Gemini')
if api_key is None or api_key == '':
    logging.warning("Gemini API key not found in environment variables.")

def configure_gemini(api_key):
    """Register *api_key* with the Gemini SDK and return a model handle.

    Args:
        api_key: Key passed straight to ``genai.configure``.

    Returns:
        A ``genai.GenerativeModel`` for 'gemini-2.0-flash'.

    Raises:
        Exception: Whatever the SDK raises; it is logged and re-raised as-is.
    """
    # 2.0 Flash was picked by the original author for vision / long context.
    model_name = 'gemini-2.0-flash'
    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)
    except Exception as exc:
        logging.error(f"Error configuring Gemini: {str(exc)}")
        raise
    return model

# -------------------------------------------------------------------------
# PROMPTS
# -------------------------------------------------------------------------

# Base extraction prompt shared by the PDF, image and text code paths in this
# file. It instructs the model to:
#   * return dates as DD/MM/YYYY and never substitute today's date,
#   * keep amounts exact (no rounding),
#   * skip balances, page numbers and running totals,
#   * answer with raw JSON shaped as {"transactions": [...]}.
# The 'Type' values listed here are the same strings that
# categorize_transaction() branches on, so keep the two in sync.
# NOTE(review): the original header referred to "Point 1 (Rounding/Dates)" and
# "Point 3 (Document Types)" of a requirements list that is not part of this
# file.
FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List).
Extract all relevant transactions/items in JSON format.

RULES:
1. **Dates**: Extract the date printed on the document. Format as DD/MM/YYYY.
   - If the year is missing in the row, use the document's context (e.g., header date).
   - Do NOT use the current date (today) unless the document explicitly says "Today".
2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.

FIELDS TO EXTRACT:
- Date: string (DD/MM/YYYY)
- Description: string (Full description of item/transaction)
- Amount: number (Float, exact value)
- Type: string (Categorize exactly as one of: 'income', 'expense', 'asset', 'liability', 'equity', 'transfer', 'investment', 'loan_repayment', 'capital_injection')
- Customer_name: string (If 'income', name of payer. If 'expense', name of payee/vendor. Else 'N/A')
- City: string (Extract from address if present, else 'N/A')
- Document_Type: string (Infer: 'statement', 'invoice', 'receipt', 'transaction_list')
- Destination_of_funds: string (Categorize based on description. e.g., 'Salaries', 'Fuel', 'Rentals', 'Equipment', etc.)

RETURN STRUCTURE:
{
  "transactions": [
    {
      "Date": "DD/MM/YYYY",
      "Description": "Item Description",
      "Customer_name": "Vendor or Payer",
      "City": "City Name",
      "Amount": 123.45,
      "Type": "expense",
      "Destination_of_funds": "Category",
      "Document_Type": "invoice"
    }
  ]
}

Return ONLY raw JSON. No markdown formatting.
"""

def get_text_prompt_with_fallback_date():
    """Build the prompt used for free-text input.

    The shared financial prompt is prefixed with a short preamble stating
    today's date (DD/MM/YYYY, local time) so the model has a fallback when the
    text carries no date of its own. This preamble is only used for raw text,
    not for PDFs.

    Returns:
        The complete prompt string.
    """
    today = datetime.now().strftime("%d/%m/%Y")
    preamble = (
        f"IMPORTANT: Today's date is {today}.\n"
        f"If the text below does not specify a year or date, reasonable assume {today} context, but prefer explicit dates in text.\n"
    )
    return f"{preamble}\n{FINANCIAL_DOC_PROMPT}\n"

# -------------------------------------------------------------------------
# CATEGORIZATION LOGIC - TYPE-BASED (FIX FOR THE BUG)
# -------------------------------------------------------------------------

def _as_lower_text(value):
    """Return *value* as stripped, lower-cased text; ``None`` becomes ''."""
    if value is None:
        return ''
    return str(value).strip().lower()


def _contains_any(text, keywords):
    """Return True when at least one of *keywords* occurs in *text*."""
    return any(keyword in text for keyword in keywords)


def _categorize_income(description, destination):
    """Pick a revenue account for an 'income' transaction."""
    if _contains_any(description, ('sales', 'service', 'revenue', 'invoice')):
        return "Sales Revenue"
    if _contains_any(description, ('interest', 'dividend')):
        return "Interest Income"
    # Everything else (incl. transfers/deposits/payments received) is income,
    # never an expense or an asset.
    return "Other Income"


def _categorize_expense(description, destination):
    """Pick an expense account; rules are checked in priority order."""
    if _contains_any(destination, ('salaries', 'wages')) or 'salary' in description:
        return "Salaries and Wages"
    if _contains_any(destination, ('water', 'electricity')):
        return "Water and Electricity"
    if 'fuel' in destination or 'petrol' in description:
        return "Fuel"
    if 'rental' in destination or 'rent' in description:
        return "Rentals"
    if _contains_any(destination, ('marketing', 'advertising')):
        return "Advertising & Marketing"
    if _contains_any(destination, ('repair', 'maintenance')):
        return "Repairs & Maintenance"
    if _contains_any(destination, ('vehicle', 'motor')):
        return "Motor Vehicle Expenses"
    if 'hardware' in destination:
        return "Hardware Expenses"
    if 'accounting' in destination:
        return "Accounting Fees"
    if 'insurance' in destination:
        return "Insurance"
    if 'bank' in destination and 'charge' in destination:
        return "Bank Charges"
    if 'loan' in destination and 'interest' in destination:
        return "Loan Interest"
    if 'subscription' in destination:
        return "Subscriptions"
    if _contains_any(destination, ('internet', 'telephone')):
        return "Computer Internet and Telephone"
    if 'training' in destination:
        return "Staff Training"
    if _contains_any(destination, ('travel', 'accommodation')):
        return "Travel and Accommodation"
    if 'depreciation' in destination:
        return "Depreciation"

    # Description-based special cases (still within the expense type).
    if 'atm' in description and 'cash' in description:
        # ATM cash withdrawals are treated as drawings, not as an asset.
        return "Owner's Drawings"
    if 'payment to' in description and _contains_any(
        description, ('fabric', 'printing', 'material')
    ):
        return "Cost of Sales"
    return "Miscellaneous Expense"


def _categorize_asset(description, destination):
    """Pick an asset account for an 'asset' transaction."""
    if 'equipment' in destination or 'equipment' in description:
        return "Equipment"
    if 'vehicle' in destination or 'vehicle' in description:
        return "Vehicles"
    if 'property' in destination or 'property' in description:
        return "Property"
    if 'technology' in destination or 'computer' in description:
        return "Technology"
    if 'furniture' in destination:
        return "Furniture"
    return "Other Assets"


def _categorize_liability(description, destination):
    """Pick a liability account for a 'liability' transaction."""
    if 'bank loan' in destination or 'loan' in description:
        return "Bank Loan"
    if 'credit' in destination:
        return "Credit Facility"
    return "Other Liabilities"


def _categorize_equity(description, destination):
    """Pick an equity account for an 'equity' transaction."""
    if 'owner' in destination or 'capital' in description:
        return "Owner Investment"
    if 'retained' in destination:
        return "Retained Earnings"
    return "Other Equity"


def _categorize_investment(description, destination):
    """Pick an investment account for an 'investment' transaction."""
    if 'securities' in destination or 'stock' in description:
        return "Securities"
    if 'mutual' in destination:
        return "Mutual Funds"
    return "Other Investments"


# Types whose account never depends on keywords.
_FIXED_CATEGORIES = {
    'transfer': "Internal Transfer",
    'loan_repayment': "Loan Repayment",
    'capital_injection': "Capital Injection",
}

# Types that need a keyword lookup: type -> f(description, destination).
_KEYWORD_CATEGORIZERS = {
    'income': _categorize_income,
    'expense': _categorize_expense,
    'asset': _categorize_asset,
    'liability': _categorize_liability,
    'equity': _categorize_equity,
    'investment': _categorize_investment,
}


def categorize_transaction(transaction):
    """Attach an 'Account_Category' to *transaction* based on its Type.

    The Type field decides which family of accounts is considered; keywords
    in 'Description' / 'Destination_of_funds' only refine the choice inside
    that family. This keeps, for example, the word "cash" in an expense
    description from turning the entry into an asset.

    Field values are normalised before matching: ``None`` (JSON ``null``) and
    non-string values are tolerated, and surrounding whitespace and letter
    case are ignored.

    Args:
        transaction: dict that may contain 'Type', 'Description' and
            'Destination_of_funds'. It is modified in place.

    Returns:
        The same dict, with 'Account_Category' set. Unknown or missing types
        yield "Uncategorized".
    """
    tx_type = _as_lower_text(transaction.get('Type'))
    description = _as_lower_text(transaction.get('Description'))
    destination = _as_lower_text(transaction.get('Destination_of_funds'))

    if tx_type in _FIXED_CATEGORIES:
        account_category = _FIXED_CATEGORIES[tx_type]
    elif tx_type in _KEYWORD_CATEGORIZERS:
        account_category = _KEYWORD_CATEGORIZERS[tx_type](description, destination)
    else:
        account_category = "Uncategorized"

    transaction['Account_Category'] = account_category
    return transaction

# -------------------------------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------------------------------

def extract_json_from_response(response_text):
    """Parse the JSON payload embedded in a Gemini reply.

    Markdown code fences are stripped first. If the remaining text contains a
    ``{ ... }`` span (first opening brace to last closing brace) that span is
    parsed; otherwise the whole remaining text is parsed.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: When decoding fails. The exception message is the exact
            string that could not be parsed, so the caller can hand it to a
            repair step.
    """
    stripped = response_text
    for fence_pattern in (r'```json\s*', r'```\s*'):
        stripped = re.sub(fence_pattern, '', stripped)

    found = re.search(r'(\{.*\})', stripped, re.DOTALL)
    candidate = stripped if found is None else found.group(1)

    try:
        parsed = json.loads(candidate)
    except json.JSONDecodeError:
        logging.warning("JSON parsing failed, attempting repair.")
        raise ValueError(candidate)  # carries the unparsable text to the caller
    return parsed

def repair_json_with_gemini(model, broken_json_string):
    """Ask the model to rewrite malformed JSON and parse its answer.

    Args:
        model: Object exposing ``generate_content(prompt)``.
        broken_json_string: The text that failed to parse.

    Returns:
        The parsed repair result, or ``{"transactions": []}`` if the model
        call or the parsing of its reply fails for any reason (the failure is
        logged, never raised).
    """
    repair_prompt = (
        "Fix this broken JSON string. Return ONLY valid JSON.\n"
        f"    Broken JSON: {broken_json_string}"
    )
    try:
        reply = model.generate_content(repair_prompt)
        repaired = extract_json_from_response(reply.text)
    except Exception as exc:
        logging.error(f"JSON repair failed: {exc}")
        return {"transactions": []}  # fail-safe: empty result
    return repaired

def _finalize_gemini_result(parsed):
    """Coerce a parsed payload to ``{'transactions': [...]}`` and categorize it."""
    if isinstance(parsed, list):
        # The model answered with a bare array instead of the wrapper object.
        parsed = {'transactions': parsed}
    elif not isinstance(parsed, dict):
        logging.warning(
            "Gemini returned unexpected JSON of type %s; treating it as empty.",
            type(parsed).__name__,
        )
        return {"transactions": []}

    if 'transactions' in parsed:
        # A JSON null under 'transactions' is treated as "no transactions".
        items = parsed['transactions'] or []
        parsed['transactions'] = [categorize_transaction(tx) for tx in items]
    return parsed


def call_gemini_with_retry(model, content, prompt, retries=2):
    """Send *prompt* + *content* to Gemini and return categorized transactions.

    Args:
        model: Object exposing ``generate_content``.
        content: Either a string (text) or a PIL.Image object (vision).
        prompt: Instruction text sent ahead of *content*.
        retries: Number of additional attempts after the first one.

    Returns:
        A dict. When it has a 'transactions' key, every entry has been passed
        through ``categorize_transaction``. If every attempt was rate limited
        the result is ``{"transactions": []}``.

    Raises:
        Exception: The last non-rate-limit error once all attempts are used.
    """
    for attempt in range(retries + 1):
        is_last_attempt = attempt == retries
        try:
            # The SDK accepts [prompt, image] as well as [prompt, text].
            response = model.generate_content([prompt, content])

            # Read the text OUTSIDE the JSON try-block: the SDK's `.text`
            # accessor may itself raise ValueError (e.g. blocked or empty
            # responses), and that message must not be mistaken for
            # "broken JSON" and fed to the repair prompt.
            response_text = response.text

            try:
                parsed = extract_json_from_response(response_text)
            except ValueError as ve:
                # The ValueError message is the unparsable JSON string.
                parsed = repair_json_with_gemini(model, str(ve))

            return _finalize_gemini_result(parsed)

        except Exception as e:
            if "429" in str(e) or "ResourceExhausted" in str(e):
                logging.warning(
                    "Gemini rate limit hit (attempt %d of %d).", attempt + 1, retries + 1
                )
                if not is_last_attempt:
                    # Linear back-off; pointless after the final attempt.
                    time.sleep(2 * (attempt + 1))
                continue
            logging.error(f"Gemini Error: {e}")
            if is_last_attempt:
                raise

    return {"transactions": []}

def is_file_empty(file_path):
    """Return True when the file at *file_path* has a size of zero bytes."""
    size_in_bytes = os.stat(file_path).st_size
    return size_in_bytes == 0

# -------------------------------------------------------------------------
# CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
# -------------------------------------------------------------------------

def process_pdf_page_as_image(model, pdf_path, page_num):
    """Render one PDF page to an image and extract transactions via vision.

    Args:
        model: Gemini model handle passed on to ``call_gemini_with_retry``.
        pdf_path: Path of the PDF on disk.
        page_num: Page to render, forwarded as both ``first_page`` and
            ``last_page`` so only a single page is converted per call.

    Returns:
        The list under 'transactions' in the model result, or an empty list
        when the conversion produced no image.

    Raises:
        ImportError: When pdf2image could not be imported at start-up.
    """
    if not PDF_IMAGE_SUPPORT:
        raise ImportError("pdf2image/poppler not installed")

    rendered_pages = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if rendered_pages:
        page_image = rendered_pages[0]
        extraction = call_gemini_with_retry(model, page_image, FINANCIAL_DOC_PROMPT)
        return extraction.get('transactions', [])
    return []

@app.route('/process-pdf', methods=['POST'])
def process_pdf():
    """Extract transactions from an uploaded PDF (multipart field 'file').

    Strategy:
    1. Reject missing, unnamed or empty uploads with HTTP 400.
    2. Per page, try plain text extraction; pages yielding more than 50
       non-blank characters are sent to the model as text.
    3. Pages with little or no text (scans, failed extraction) are rendered
       to an image and sent to the model's vision input, if pdf2image is
       available.
    4. If pypdf cannot read the file at all, every page is processed as an
       image instead.

    Returns:
        JSON ``{'transactions': [...]}`` on success, or ``{'error': ...}``
        with status 400 (bad upload) or 500 (any other failure).
    """
    temp_path = None
    try:
        # 1. Validation
        if 'file' not in request.files:
            return jsonify({'error': 'No file uploaded'}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # Reserve the temp path and record it BEFORE writing, so the finally
        # block removes the file even when saving the upload fails. The handle
        # is closed first so the upload is written through a single handle.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            temp_path = tmp.name
        file.save(temp_path)

        if is_file_empty(temp_path):
            return jsonify({'error': 'Uploaded file is empty'}), 400

        model = configure_gemini(api_key)
        all_transactions = []

        try:
            reader = pypdf.PdfReader(temp_path)
            num_pages = len(reader.pages)

            # page_number is 1-based (what pdf2image expects); pypdf indexes
            # pages from 0.
            for page_number in range(1, num_pages + 1):
                logging.info(f"Processing page {page_number}/{num_pages}")

                try:
                    text_content = reader.pages[page_number - 1].extract_text()
                except Exception:
                    text_content = ""  # force the vision fallback for this page

                if text_content and len(text_content.strip()) > 50:
                    # Strategy A: text mode
                    logging.info("Text detected. Using Text Strategy.")
                    result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
                    page_transactions = result.get('transactions', [])
                else:
                    # Strategy B: vision fallback
                    logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
                    if PDF_IMAGE_SUPPORT:
                        page_transactions = process_pdf_page_as_image(model, temp_path, page_number)
                    else:
                        logging.warning("Cannot process scanned PDF - pdf2image missing.")
                        page_transactions = []

                all_transactions.extend(page_transactions)

        except pypdf.errors.PdfReadError:
            # pypdf could not read the file; fall back to vision for the
            # whole document.
            logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
            if not PDF_IMAGE_SUPPORT:
                raise ValueError("PDF is unreadable and Vision fallback is unavailable.")

            # Drop anything collected before the failure so pages are not
            # reported twice.
            all_transactions = []
            # Warning: converting all pages at once may be slow / memory heavy.
            for img in convert_from_path(temp_path):
                result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
                all_transactions.extend(result.get('transactions', []))

        return jsonify({'transactions': all_transactions})

    except Exception as e:
        logging.error(f"Server Error: {e}")
        return jsonify({'error': str(e)}), 500
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

# -------------------------------------------------------------------------
# TEXT & IMAGE ENDPOINTS (UPDATED)
# -------------------------------------------------------------------------

@app.route('/process-text', methods=['POST'])
def process_text():
    """Extract transactions from raw text posted as JSON ``{"text": "..."}``.

    Returns:
        JSON ``{'transactions': [...]}`` on success; ``{'error': ...}`` with
        status 400 for a missing/invalid body or text, 500 for other failures.
    """
    try:
        # silent=True yields None for a missing, non-JSON or malformed body
        # instead of raising, so bad client input is reported as 400, not 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'text' not in data:
            return jsonify({'error': 'No text provided'}), 400

        text_input = data['text']
        if not isinstance(text_input, str):
            return jsonify({'error': 'Text input must be a string'}), 400
        if not text_input.strip():
            return jsonify({'error': 'Text input cannot be empty'}), 400

        model = configure_gemini(api_key)
        # Raw text gets the prompt variant that allows a current-date fallback.
        prompt = get_text_prompt_with_fallback_date()

        result = call_gemini_with_retry(model, text_input, prompt)
        return jsonify({'transactions': result.get('transactions', [])})

    except Exception as e:
        logging.error(f"Error: {e}")
        return jsonify({'error': str(e)}), 500

@app.route('/process-image', methods=['POST'])
def process_image():
    """Extract transactions from an uploaded image (multipart field 'file').

    Returns:
        JSON ``{'transactions': [...]}`` on success; ``{'error': ...}`` with
        status 400 for a missing or empty upload, 500 for other failures.
    """
    temp_path = None
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file uploaded'}), 400
        file = request.files['file']

        # Empty check: measure the stream, then rewind for the later save.
        file.seek(0, os.SEEK_END)
        size = file.tell()
        file.seek(0)
        if size == 0:
            return jsonify({'error': 'File is empty'}), 400

        # Keep the original extension (if any); the filename may be missing.
        suffix = os.path.splitext(file.filename or '')[1]

        # Record the temp path BEFORE writing so the finally block removes the
        # file even when saving the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            temp_path = tmp.name
        file.save(temp_path)

        model = configure_gemini(api_key)

        # Close the image handle before the temp file is deleted.
        with Image.open(temp_path) as img:
            result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)

        return jsonify({'transactions': result.get('transactions', [])})

    except Exception as e:
        logging.error(f"Error: {e}")
        return jsonify({'error': str(e)}), 500
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)

@app.route('/transaction-types', methods=['GET'])
def get_transaction_types():
    """Serve the static catalogue of transaction types.

    Each entry carries the type name, a human-readable description and the
    destination categories offered for it. The content is kept unchanged for
    backwards compatibility.
    """
    # (type, description, destination categories)
    catalogue = (
        (
            "income",
            "Money received from customers, sales, services rendered",
            ("income",),
        ),
        (
            "expense",
            "Operational costs, purchases, payments made",
            (
                "Water and electricity", "Salaries and wages", "Repairs & Maintenance",
                "Motor vehicle expenses", "Projects Expenses", "Hardware expenses",
                "Refunds", "Accounting fees", "Loan interest", "Bank charges",
                "Insurance", "SARS PAYE UIF", "Advertising & Marketing",
                "Logistics and distribution", "Fuel", "Website hosting fees",
                "Rentals", "Subscriptions", "Computer internet and Telephone",
                "Staff training", "Travel and accommodation", "Depreciation",
                "Other expenses",
            ),
        ),
        (
            "asset",
            "Purchase of equipment, property, vehicles, or other assets",
            ("Equipment", "Property", "Vehicles", "Technology", "Furniture", "Other assets"),
        ),
        (
            "liability",
            "Taking on debt, loans received, credit facilities",
            ("Bank loan", "Credit facility", "Supplier credit", "Other liabilities"),
        ),
        (
            "equity",
            "Owner investments, capital contributions, retained earnings transfers",
            ("Owner investment", "Retained earnings", "Share capital", "Other equity"),
        ),
        (
            "transfer",
            "Money moved between own accounts, internal transfers",
            ("Internal transfer",),
        ),
        (
            "investment",
            "Securities purchases, investment account funding, portfolio additions",
            ("Securities", "Mutual funds", "Fixed deposits", "Other investments"),
        ),
        (
            "loan_repayment",
            "Paying back borrowed money, loan principal payments",
            ("Loan repayment",),
        ),
        (
            "capital_injection",
            "Owner or investor adding money to the business",
            ("Capital injection",),
        ),
    )

    transaction_types = {
        "types": [
            {
                "type": type_name,
                "description": summary,
                "destination_categories": list(categories),
            }
            for type_name, summary, categories in catalogue
        ]
    }
    return jsonify(transaction_types)

@app.route('/health', methods=['GET'])
def health_check():
    """Report service status, current local timestamp, version and whether
    the pdf2image import succeeded."""
    status_report = dict(
        status='healthy',
        timestamp=datetime.now().isoformat(),
        version='2.2.0',
        vision_support=PDF_IMAGE_SUPPORT,
    )
    return jsonify(status_report)

if __name__ == '__main__':
    # Ensure this port matches your server configuration
    #
    # Debug mode enables the Werkzeug interactive debugger, which allows
    # arbitrary code execution; it must not be on by default for a server
    # bound to all interfaces. Opt in for local work with FLASK_DEBUG=1.
    debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)