import os import json import logging import re import tempfile import time from datetime import datetime from io import BytesIO # Third-party imports from flask import Flask, request, jsonify from flask_cors import CORS import pandas as pd import pypdf import google.generativeai as genai from PIL import Image # specific import for image fallback try: from pdf2image import convert_from_path PDF_IMAGE_SUPPORT = True except ImportError: PDF_IMAGE_SUPPORT = False logging.warning("pdf2image not installed. Scanned/Encrypted PDF fallback will not work.") # Setup logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') app = Flask(__name__) CORS(app) # Get API key securely api_key = os.getenv('Gemini') if not api_key: # Fallback for local testing if env var not set, though env var is preferred logging.warning("Gemini API key not found in environment variables.") def configure_gemini(api_key): """Configure Gemini AI model.""" try: genai.configure(api_key=api_key) # Using 2.0 Flash as it has superior vision and long-context capabilities return genai.GenerativeModel('gemini-2.0-flash') except Exception as e: logging.error(f"Error configuring Gemini: {str(e)}") raise # ------------------------------------------------------------------------- # PROMPTS # ------------------------------------------------------------------------- # Enhanced Prompt for General Financial Documents (Statements, Invoices, Receipts) # Addresses Point 1 (Rounding/Dates) & Point 3 (Document Types) FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List). Extract all relevant transactions/items in JSON format. RULES: 1. **Dates**: Extract the date printed on the document. Format as DD/MM/YYYY. - If the year is missing in the row, use the document's context (e.g., header date). - Do NOT use the current date (today) unless the document explicitly says "Today". 2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND. 3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals. FIELDS TO EXTRACT: - Date: string (DD/MM/YYYY) - Description: string (Full description of item/transaction) - Amount: number (Float, exact value) - Type: string (Categorize exactly as one of: 'income', 'expense', 'asset', 'liability', 'equity', 'transfer', 'investment', 'loan_repayment', 'capital_injection') - Customer_name: string (If 'income', name of payer. If 'expense', name of payee/vendor. Else 'N/A') - City: string (Extract from address if present, else 'N/A') - Document_Type: string (Infer: 'statement', 'invoice', 'receipt', 'transaction_list') - Destination_of_funds: string (Categorize based on description. e.g., 'Salaries', 'Fuel', 'Rentals', 'Equipment', etc.) RETURN STRUCTURE: { "transactions": [ { "Date": "DD/MM/YYYY", "Description": "Item Description", "Customer_name": "Vendor or Payer", "City": "City Name", "Amount": 123.45, "Type": "expense", "Destination_of_funds": "Category", "Document_Type": "invoice" } ] } Return ONLY raw JSON. No markdown formatting. """ def get_text_prompt_with_fallback_date(): """ Generate prompt for raw text snippets where context might be missing. Only allows current date fallback for raw text, not PDFs. """ current_date = datetime.now().strftime("%d/%m/%Y") return f"""IMPORTANT: Today's date is {current_date}. If the text below does not specify a year or date, reasonable assume {current_date} context, but prefer explicit dates in text. {FINANCIAL_DOC_PROMPT} """ # ------------------------------------------------------------------------- # CATEGORIZATION LOGIC - TYPE-BASED (FIX FOR THE BUG) # ------------------------------------------------------------------------- def categorize_transaction(transaction): """ Categorizes a transaction based strictly on its Type field. This prevents keyword-based misclassification. Args: transaction: dict with keys including 'Type', 'Description', 'Destination_of_funds' Returns: dict with added 'Account_Category' field """ tx_type = transaction.get('Type', '').lower() description = transaction.get('Description', '').lower() destination = transaction.get('Destination_of_funds', '').lower() # Add the categorized account field account_category = "Uncategorized" # ========== INCOME TYPE ========== if tx_type == 'income': # All income should map to revenue accounts, NOT expenses if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']): account_category = "Sales Revenue" elif any(keyword in description for keyword in ['interest', 'dividend']): account_category = "Interest Income" elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']): # This fixes the "Income Trap" - transfers FROM others are income account_category = "Other Income" else: account_category = "Other Income" # ========== EXPENSE TYPE ========== elif tx_type == 'expense': # Map based on Destination_of_funds or description keywords # This is TYPE-FIRST, so "cash" in description won't make it an asset # Specific expense categories based on your system if 'salaries' in destination or 'wages' in destination or 'salary' in description: account_category = "Salaries and Wages" elif 'water' in destination or 'electricity' in destination: account_category = "Water and Electricity" elif 'fuel' in destination or 'petrol' in description: account_category = "Fuel" elif 'rental' in destination or 'rent' in description: account_category = "Rentals" elif 'marketing' in destination or 'advertising' in destination: account_category = "Advertising & Marketing" elif 'repair' in destination or 'maintenance' in destination: account_category = "Repairs & Maintenance" elif 'vehicle' in destination or 'motor' in destination: account_category = "Motor Vehicle Expenses" elif 'hardware' in destination: account_category = "Hardware Expenses" elif 'accounting' in destination: account_category = "Accounting Fees" elif 'insurance' in destination: account_category = "Insurance" elif 'bank' in destination and 'charge' in destination: account_category = "Bank Charges" elif 'loan' in destination and 'interest' in destination: account_category = "Loan Interest" elif 'subscription' in destination: account_category = "Subscriptions" elif 'internet' in destination or 'telephone' in destination: account_category = "Computer Internet and Telephone" elif 'training' in destination: account_category = "Staff Training" elif 'travel' in destination or 'accommodation' in destination: account_category = "Travel and Accommodation" elif 'depreciation' in destination: account_category = "Depreciation" # Special cases based on description (but still respecting expense type) elif 'atm' in description and 'cash' in description: # This fixes the "Cash Trap" - ATM withdrawals are drawings, not assets account_category = "Owner's Drawings" elif 'payment to' in description: # Payment to suppliers/vendors if any(word in description for word in ['fabric', 'printing', 'material']): account_category = "Cost of Sales" else: account_category = "Miscellaneous Expense" else: account_category = "Miscellaneous Expense" # ========== ASSET TYPE ========== elif tx_type == 'asset': if 'equipment' in destination or 'equipment' in description: account_category = "Equipment" elif 'vehicle' in destination or 'vehicle' in description: account_category = "Vehicles" elif 'property' in destination or 'property' in description: account_category = "Property" elif 'technology' in destination or 'computer' in description: account_category = "Technology" elif 'furniture' in destination: account_category = "Furniture" else: account_category = "Other Assets" # ========== LIABILITY TYPE ========== elif tx_type == 'liability': if 'bank loan' in destination or 'loan' in description: account_category = "Bank Loan" elif 'credit' in destination: account_category = "Credit Facility" else: account_category = "Other Liabilities" # ========== EQUITY TYPE ========== elif tx_type == 'equity': if 'owner' in destination or 'capital' in description: account_category = "Owner Investment" elif 'retained' in destination: account_category = "Retained Earnings" else: account_category = "Other Equity" # ========== TRANSFER TYPE ========== elif tx_type == 'transfer': account_category = "Internal Transfer" # ========== INVESTMENT TYPE ========== elif tx_type == 'investment': if 'securities' in destination or 'stock' in description: account_category = "Securities" elif 'mutual' in destination: account_category = "Mutual Funds" else: account_category = "Other Investments" # ========== LOAN REPAYMENT TYPE ========== elif tx_type == 'loan_repayment': account_category = "Loan Repayment" # ========== CAPITAL INJECTION TYPE ========== elif tx_type == 'capital_injection': account_category = "Capital Injection" # Add the category to the transaction transaction['Account_Category'] = account_category return transaction # ------------------------------------------------------------------------- # HELPER FUNCTIONS # ------------------------------------------------------------------------- def extract_json_from_response(response_text): """Extract valid JSON from Gemini's response, handling Markdown fences.""" # Remove markdown code blocks cleaned_text = re.sub(r'```json\s*', '', response_text) cleaned_text = re.sub(r'```\s*', '', cleaned_text) # Find JSON object match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL) if match: json_string = match.group(1) else: # Fallback: assume the whole text is JSON json_string = cleaned_text try: return json.loads(json_string) except json.JSONDecodeError: logging.warning("JSON parsing failed, attempting repair.") raise ValueError(json_string) # Pass invalid string to caller for repair def repair_json_with_gemini(model, broken_json_string): """Uses Gemini to fix broken JSON syntax.""" repair_prompt = f"""Fix this broken JSON string. Return ONLY valid JSON. Broken JSON: {broken_json_string}""" try: resp = model.generate_content(repair_prompt) return extract_json_from_response(resp.text) except Exception as e: logging.error(f"JSON repair failed: {e}") return {"transactions": []} # Fail safe def call_gemini_with_retry(model, content, prompt, retries=2): """ Generic runner for Gemini. Args: content: Can be a String (text) or a PIL.Image object (vision). """ for attempt in range(retries + 1): try: # Gemini Python SDK handles [Prompt, Image] or [Prompt, Text] automatically response = model.generate_content([prompt, content]) try: result = extract_json_from_response(response.text) # POST-PROCESSING: Categorize each transaction based on Type if 'transactions' in result: result['transactions'] = [ categorize_transaction(tx) for tx in result['transactions'] ] return result except ValueError as ve: # Value error here contains the broken JSON string broken_json = str(ve) repaired = repair_json_with_gemini(model, broken_json) # Categorize repaired transactions too if 'transactions' in repaired: repaired['transactions'] = [ categorize_transaction(tx) for tx in repaired['transactions'] ] return repaired except Exception as e: if "429" in str(e) or "ResourceExhausted" in str(e): time.sleep(2 * (attempt + 1)) continue logging.error(f"Gemini Error: {e}") if attempt == retries: raise return {"transactions": []} def is_file_empty(file_path): """Check if file is empty.""" return os.path.getsize(file_path) == 0 # ------------------------------------------------------------------------- # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION) # ------------------------------------------------------------------------- def process_pdf_page_as_image(model, pdf_path, page_num): """Point 4: Convert specific PDF page to image and process with Vision.""" if not PDF_IMAGE_SUPPORT: raise ImportError("pdf2image/poppler not installed") # Convert specific page to image # first_page=page_num, last_page=page_num ensures we only convert 1 page at a time to save RAM images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num) if not images: return [] # Process the image result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT) return result.get('transactions', []) @app.route('/process-pdf', methods=['POST']) def process_pdf(): """ Smart PDF Processor: 1. Checks if empty. 2. Tries standard Text extraction (Fast/Cheap). 3. If Text fails (Encryption) or is empty (Scanned), falls back to Vision (Slow/Powerful). """ temp_path = None try: # 1. Validation if 'file' not in request.files: return jsonify({'error': 'No file uploaded'}), 400 file = request.files['file'] if file.filename == '': return jsonify({'error': 'No file selected'}), 400 # Save Temp with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp: file.save(tmp.name) temp_path = tmp.name # Point 2: Empty File Check if is_file_empty(temp_path): return jsonify({'error': 'Uploaded file is empty'}), 400 model = configure_gemini(api_key) all_transactions = [] # Determine strategy: Try reading PDF structure first try: reader = pypdf.PdfReader(temp_path) num_pages = len(reader.pages) for i in range(num_pages): logging.info(f"Processing page {i+1}/{num_pages}") # Attempt Text Extraction try: text_content = reader.pages[i].extract_text() except Exception: text_content = "" # Force fallback if extraction fails # LOGIC: Check if text is sufficient. If < 50 chars, it's likely a scan or image-heavy. if text_content and len(text_content.strip()) > 50: # Strategy A: Text Mode logging.info("Text detected. Using Text Strategy.") result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT) else: # Strategy B: Vision Fallback (Point 4) logging.info("Low text/Encryption detected. Switching to Vision Strategy.") if PDF_IMAGE_SUPPORT: # Page numbers in pypdf are 0-indexed, pdf2image uses 1-based indexing often, # but convert_from_path handles slicing via first_page/last_page (1-based) txs = process_pdf_page_as_image(model, temp_path, i+1) all_transactions.extend(txs) continue # Skip the rest of loop else: logging.warning("Cannot process scanned PDF - pdf2image missing.") result = {"transactions": []} txs = result.get('transactions', []) all_transactions.extend(txs) except pypdf.errors.PdfReadError: # If pypdf fails completely (e.g., highly corrupted or weird encryption), try Vision on whole file logging.warning("pypdf failed to read file. Attempting full Vision fallback.") if PDF_IMAGE_SUPPORT: # Warning: Processing all pages as images might be slow images = convert_from_path(temp_path) for img in images: result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT) all_transactions.extend(result.get('transactions', [])) else: raise ValueError("PDF is unreadable and Vision fallback is unavailable.") return jsonify({'transactions': all_transactions}) except Exception as e: logging.error(f"Server Error: {e}") return jsonify({'error': str(e)}), 500 finally: if temp_path and os.path.exists(temp_path): os.remove(temp_path) # ------------------------------------------------------------------------- # TEXT & IMAGE ENDPOINTS (UPDATED) # ------------------------------------------------------------------------- @app.route('/process-text', methods=['POST']) def process_text(): """Handle raw text input.""" try: data = request.get_json() if not data or 'text' not in data: return jsonify({'error': 'No text provided'}), 400 text_input = data['text'] if not text_input.strip(): return jsonify({'error': 'Text input cannot be empty'}), 400 # Point 2 model = configure_gemini(api_key) # Use specific prompt with date fallback for raw text prompt = get_text_prompt_with_fallback_date() result = call_gemini_with_retry(model, text_input, prompt) return jsonify({'transactions': result.get('transactions', [])}) except Exception as e: logging.error(f"Error: {e}") return jsonify({'error': str(e)}), 500 @app.route('/process-image', methods=['POST']) def process_image(): """Handle raw image upload (Receipts, Photos of invoices).""" temp_path = None try: if 'file' not in request.files: return jsonify({'error': 'No file uploaded'}), 400 file = request.files['file'] # Point 2: Empty check file.seek(0, os.SEEK_END) size = file.tell() file.seek(0) if size == 0: return jsonify({'error': 'File is empty'}), 400 with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as tmp: file.save(tmp.name) temp_path = tmp.name model = configure_gemini(api_key) # Load image with PIL img = Image.open(temp_path) # Use the General Financial Prompt result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT) return jsonify({'transactions': result.get('transactions', [])}) except Exception as e: logging.error(f"Error: {e}") return jsonify({'error': str(e)}), 500 finally: if temp_path and os.path.exists(temp_path): os.remove(temp_path) @app.route('/transaction-types', methods=['GET']) def get_transaction_types(): """Return available transaction types and their categories.""" # Kept identical for backwards compatibility transaction_types = { "types": [ { "type": "income", "description": "Money received from customers, sales, services rendered", "destination_categories": ["income"] }, { "type": "expense", "description": "Operational costs, purchases, payments made", "destination_categories": [ "Water and electricity", "Salaries and wages", "Repairs & Maintenance", "Motor vehicle expenses", "Projects Expenses", "Hardware expenses", "Refunds", "Accounting fees", "Loan interest", "Bank charges", "Insurance", "SARS PAYE UIF", "Advertising & Marketing", "Logistics and distribution", "Fuel", "Website hosting fees", "Rentals", "Subscriptions", "Computer internet and Telephone", "Staff training", "Travel and accommodation", "Depreciation", "Other expenses" ] }, { "type": "asset", "description": "Purchase of equipment, property, vehicles, or other assets", "destination_categories": [ "Equipment", "Property", "Vehicles", "Technology", "Furniture", "Other assets" ] }, { "type": "liability", "description": "Taking on debt, loans received, credit facilities", "destination_categories": [ "Bank loan", "Credit facility", "Supplier credit", "Other liabilities" ] }, { "type": "equity", "description": "Owner investments, capital contributions, retained earnings transfers", "destination_categories": [ "Owner investment", "Retained earnings", "Share capital", "Other equity" ] }, { "type": "transfer", "description": "Money moved between own accounts, internal transfers", "destination_categories": ["Internal transfer"] }, { "type": "investment", "description": "Securities purchases, investment account funding, portfolio additions", "destination_categories": [ "Securities", "Mutual funds", "Fixed deposits", "Other investments" ] }, { "type": "loan_repayment", "description": "Paying back borrowed money, loan principal payments", "destination_categories": ["Loan repayment"] }, { "type": "capital_injection", "description": "Owner or investor adding money to the business", "destination_categories": ["Capital injection"] } ] } return jsonify(transaction_types) @app.route('/health', methods=['GET']) def health_check(): return jsonify({ 'status': 'healthy', 'timestamp': datetime.now().isoformat(), 'version': '2.2.0', 'vision_support': PDF_IMAGE_SUPPORT }) if __name__ == '__main__': # Ensure this port matches your server configuration app.run(debug=True, host="0.0.0.0", port=7860)