stmt-api / main.py
rairo's picture
Update main.py
ce82bf0 verified
import os
import json
import logging
import re
import tempfile
import time
from datetime import datetime
from io import BytesIO
# Third-party imports
from flask import Flask, request, jsonify
from flask_cors import CORS
import pandas as pd
import pypdf
import google.generativeai as genai
from PIL import Image
# specific import for image fallback
try:
from pdf2image import convert_from_path
PDF_IMAGE_SUPPORT = True
except ImportError:
PDF_IMAGE_SUPPORT = False
logging.warning("pdf2image not installed. Scanned/Encrypted PDF fallback will not work.")
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
app = Flask(__name__)
CORS(app)
# Get API key securely
api_key = os.getenv('Gemini')
if not api_key:
# Fallback for local testing if env var not set, though env var is preferred
logging.warning("Gemini API key not found in environment variables.")
def configure_gemini(api_key):
"""Configure Gemini AI model."""
try:
genai.configure(api_key=api_key)
# Using 2.0 Flash as it has superior vision and long-context capabilities
return genai.GenerativeModel('gemini-2.0-flash')
except Exception as e:
logging.error(f"Error configuring Gemini: {str(e)}")
raise
# -------------------------------------------------------------------------
# PROMPTS
# -------------------------------------------------------------------------
# Enhanced Prompt for General Financial Documents (Statements, Invoices, Receipts)
# Addresses Point 1 (Rounding/Dates) & Point 3 (Document Types)
FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List).
Extract all relevant transactions/items in JSON format.
RULES:
1. **Dates**: Extract the date printed on the document. Format as DD/MM/YYYY.
- If the year is missing in the row, use the document's context (e.g., header date).
- Do NOT use the current date (today) unless the document explicitly says "Today".
2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND.
3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals.
FIELDS TO EXTRACT:
- Date: string (DD/MM/YYYY)
- Description: string (Full description of item/transaction)
- Amount: number (Float, exact value)
- Type: string (Categorize exactly as one of: 'income', 'expense', 'asset', 'liability', 'equity', 'transfer', 'investment', 'loan_repayment', 'capital_injection')
- Customer_name: string (If 'income', name of payer. If 'expense', name of payee/vendor. Else 'N/A')
- City: string (Extract from address if present, else 'N/A')
- Document_Type: string (Infer: 'statement', 'invoice', 'receipt', 'transaction_list')
- Destination_of_funds: string (Categorize based on description. e.g., 'Salaries', 'Fuel', 'Rentals', 'Equipment', etc.)
RETURN STRUCTURE:
{
"transactions": [
{
"Date": "DD/MM/YYYY",
"Description": "Item Description",
"Customer_name": "Vendor or Payer",
"City": "City Name",
"Amount": 123.45,
"Type": "expense",
"Destination_of_funds": "Category",
"Document_Type": "invoice"
}
]
}
Return ONLY raw JSON. No markdown formatting.
"""
def get_text_prompt_with_fallback_date():
"""
Generate prompt for raw text snippets where context might be missing.
Only allows current date fallback for raw text, not PDFs.
"""
current_date = datetime.now().strftime("%d/%m/%Y")
return f"""IMPORTANT: Today's date is {current_date}.
If the text below does not specify a year or date, reasonable assume {current_date} context, but prefer explicit dates in text.
{FINANCIAL_DOC_PROMPT}
"""
# -------------------------------------------------------------------------
# CATEGORIZATION LOGIC - TYPE-BASED (FIX FOR THE BUG)
# -------------------------------------------------------------------------
def categorize_transaction(transaction):
"""
Categorizes a transaction based strictly on its Type field.
This prevents keyword-based misclassification.
Args:
transaction: dict with keys including 'Type', 'Description', 'Destination_of_funds'
Returns:
dict with added 'Account_Category' field
"""
tx_type = transaction.get('Type', '').lower()
description = transaction.get('Description', '').lower()
destination = transaction.get('Destination_of_funds', '').lower()
# Add the categorized account field
account_category = "Uncategorized"
# ========== INCOME TYPE ==========
if tx_type == 'income':
# All income should map to revenue accounts, NOT expenses
if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']):
account_category = "Sales Revenue"
elif any(keyword in description for keyword in ['interest', 'dividend']):
account_category = "Interest Income"
elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']):
# This fixes the "Income Trap" - transfers FROM others are income
account_category = "Other Income"
else:
account_category = "Other Income"
# ========== EXPENSE TYPE ==========
elif tx_type == 'expense':
# Map based on Destination_of_funds or description keywords
# This is TYPE-FIRST, so "cash" in description won't make it an asset
# Specific expense categories based on your system
if 'salaries' in destination or 'wages' in destination or 'salary' in description:
account_category = "Salaries and Wages"
elif 'water' in destination or 'electricity' in destination:
account_category = "Water and Electricity"
elif 'fuel' in destination or 'petrol' in description:
account_category = "Fuel"
elif 'rental' in destination or 'rent' in description:
account_category = "Rentals"
elif 'marketing' in destination or 'advertising' in destination:
account_category = "Advertising & Marketing"
elif 'repair' in destination or 'maintenance' in destination:
account_category = "Repairs & Maintenance"
elif 'vehicle' in destination or 'motor' in destination:
account_category = "Motor Vehicle Expenses"
elif 'hardware' in destination:
account_category = "Hardware Expenses"
elif 'accounting' in destination:
account_category = "Accounting Fees"
elif 'insurance' in destination:
account_category = "Insurance"
elif 'bank' in destination and 'charge' in destination:
account_category = "Bank Charges"
elif 'loan' in destination and 'interest' in destination:
account_category = "Loan Interest"
elif 'subscription' in destination:
account_category = "Subscriptions"
elif 'internet' in destination or 'telephone' in destination:
account_category = "Computer Internet and Telephone"
elif 'training' in destination:
account_category = "Staff Training"
elif 'travel' in destination or 'accommodation' in destination:
account_category = "Travel and Accommodation"
elif 'depreciation' in destination:
account_category = "Depreciation"
# Special cases based on description (but still respecting expense type)
elif 'atm' in description and 'cash' in description:
# This fixes the "Cash Trap" - ATM withdrawals are drawings, not assets
account_category = "Owner's Drawings"
elif 'payment to' in description:
# Payment to suppliers/vendors
if any(word in description for word in ['fabric', 'printing', 'material']):
account_category = "Cost of Sales"
else:
account_category = "Miscellaneous Expense"
else:
account_category = "Miscellaneous Expense"
# ========== ASSET TYPE ==========
elif tx_type == 'asset':
if 'equipment' in destination or 'equipment' in description:
account_category = "Equipment"
elif 'vehicle' in destination or 'vehicle' in description:
account_category = "Vehicles"
elif 'property' in destination or 'property' in description:
account_category = "Property"
elif 'technology' in destination or 'computer' in description:
account_category = "Technology"
elif 'furniture' in destination:
account_category = "Furniture"
else:
account_category = "Other Assets"
# ========== LIABILITY TYPE ==========
elif tx_type == 'liability':
if 'bank loan' in destination or 'loan' in description:
account_category = "Bank Loan"
elif 'credit' in destination:
account_category = "Credit Facility"
else:
account_category = "Other Liabilities"
# ========== EQUITY TYPE ==========
elif tx_type == 'equity':
if 'owner' in destination or 'capital' in description:
account_category = "Owner Investment"
elif 'retained' in destination:
account_category = "Retained Earnings"
else:
account_category = "Other Equity"
# ========== TRANSFER TYPE ==========
elif tx_type == 'transfer':
account_category = "Internal Transfer"
# ========== INVESTMENT TYPE ==========
elif tx_type == 'investment':
if 'securities' in destination or 'stock' in description:
account_category = "Securities"
elif 'mutual' in destination:
account_category = "Mutual Funds"
else:
account_category = "Other Investments"
# ========== LOAN REPAYMENT TYPE ==========
elif tx_type == 'loan_repayment':
account_category = "Loan Repayment"
# ========== CAPITAL INJECTION TYPE ==========
elif tx_type == 'capital_injection':
account_category = "Capital Injection"
# Add the category to the transaction
transaction['Account_Category'] = account_category
return transaction
# -------------------------------------------------------------------------
# HELPER FUNCTIONS
# -------------------------------------------------------------------------
def extract_json_from_response(response_text):
"""Extract valid JSON from Gemini's response, handling Markdown fences."""
# Remove markdown code blocks
cleaned_text = re.sub(r'```json\s*', '', response_text)
cleaned_text = re.sub(r'```\s*', '', cleaned_text)
# Find JSON object
match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
if match:
json_string = match.group(1)
else:
# Fallback: assume the whole text is JSON
json_string = cleaned_text
try:
return json.loads(json_string)
except json.JSONDecodeError:
logging.warning("JSON parsing failed, attempting repair.")
raise ValueError(json_string) # Pass invalid string to caller for repair
def repair_json_with_gemini(model, broken_json_string):
"""Uses Gemini to fix broken JSON syntax."""
repair_prompt = f"""Fix this broken JSON string. Return ONLY valid JSON.
Broken JSON: {broken_json_string}"""
try:
resp = model.generate_content(repair_prompt)
return extract_json_from_response(resp.text)
except Exception as e:
logging.error(f"JSON repair failed: {e}")
return {"transactions": []} # Fail safe
def call_gemini_with_retry(model, content, prompt, retries=2):
"""
Generic runner for Gemini.
Args:
content: Can be a String (text) or a PIL.Image object (vision).
"""
for attempt in range(retries + 1):
try:
# Gemini Python SDK handles [Prompt, Image] or [Prompt, Text] automatically
response = model.generate_content([prompt, content])
try:
result = extract_json_from_response(response.text)
# POST-PROCESSING: Categorize each transaction based on Type
if 'transactions' in result:
result['transactions'] = [
categorize_transaction(tx) for tx in result['transactions']
]
return result
except ValueError as ve:
# Value error here contains the broken JSON string
broken_json = str(ve)
repaired = repair_json_with_gemini(model, broken_json)
# Categorize repaired transactions too
if 'transactions' in repaired:
repaired['transactions'] = [
categorize_transaction(tx) for tx in repaired['transactions']
]
return repaired
except Exception as e:
if "429" in str(e) or "ResourceExhausted" in str(e):
time.sleep(2 * (attempt + 1))
continue
logging.error(f"Gemini Error: {e}")
if attempt == retries:
raise
return {"transactions": []}
def is_file_empty(file_path):
"""Check if file is empty."""
return os.path.getsize(file_path) == 0
# -------------------------------------------------------------------------
# CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
# -------------------------------------------------------------------------
def process_pdf_page_as_image(model, pdf_path, page_num):
"""Point 4: Convert specific PDF page to image and process with Vision."""
if not PDF_IMAGE_SUPPORT:
raise ImportError("pdf2image/poppler not installed")
# Convert specific page to image
# first_page=page_num, last_page=page_num ensures we only convert 1 page at a time to save RAM
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
if not images:
return []
# Process the image
result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
return result.get('transactions', [])
@app.route('/process-pdf', methods=['POST'])
def process_pdf():
"""
Smart PDF Processor:
1. Checks if empty.
2. Tries standard Text extraction (Fast/Cheap).
3. If Text fails (Encryption) or is empty (Scanned), falls back to Vision (Slow/Powerful).
"""
temp_path = None
try:
# 1. Validation
if 'file' not in request.files:
return jsonify({'error': 'No file uploaded'}), 400
file = request.files['file']
if file.filename == '':
return jsonify({'error': 'No file selected'}), 400
# Save Temp
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
file.save(tmp.name)
temp_path = tmp.name
# Point 2: Empty File Check
if is_file_empty(temp_path):
return jsonify({'error': 'Uploaded file is empty'}), 400
model = configure_gemini(api_key)
all_transactions = []
# Determine strategy: Try reading PDF structure first
try:
reader = pypdf.PdfReader(temp_path)
num_pages = len(reader.pages)
for i in range(num_pages):
logging.info(f"Processing page {i+1}/{num_pages}")
# Attempt Text Extraction
try:
text_content = reader.pages[i].extract_text()
except Exception:
text_content = "" # Force fallback if extraction fails
# LOGIC: Check if text is sufficient. If < 50 chars, it's likely a scan or image-heavy.
if text_content and len(text_content.strip()) > 50:
# Strategy A: Text Mode
logging.info("Text detected. Using Text Strategy.")
result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
else:
# Strategy B: Vision Fallback (Point 4)
logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
if PDF_IMAGE_SUPPORT:
# Page numbers in pypdf are 0-indexed, pdf2image uses 1-based indexing often,
# but convert_from_path handles slicing via first_page/last_page (1-based)
txs = process_pdf_page_as_image(model, temp_path, i+1)
all_transactions.extend(txs)
continue # Skip the rest of loop
else:
logging.warning("Cannot process scanned PDF - pdf2image missing.")
result = {"transactions": []}
txs = result.get('transactions', [])
all_transactions.extend(txs)
except pypdf.errors.PdfReadError:
# If pypdf fails completely (e.g., highly corrupted or weird encryption), try Vision on whole file
logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
if PDF_IMAGE_SUPPORT:
# Warning: Processing all pages as images might be slow
images = convert_from_path(temp_path)
for img in images:
result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
all_transactions.extend(result.get('transactions', []))
else:
raise ValueError("PDF is unreadable and Vision fallback is unavailable.")
return jsonify({'transactions': all_transactions})
except Exception as e:
logging.error(f"Server Error: {e}")
return jsonify({'error': str(e)}), 500
finally:
if temp_path and os.path.exists(temp_path):
os.remove(temp_path)
# -------------------------------------------------------------------------
# TEXT & IMAGE ENDPOINTS (UPDATED)
# -------------------------------------------------------------------------
@app.route('/process-text', methods=['POST'])
def process_text():
"""Handle raw text input."""
try:
data = request.get_json()
if not data or 'text' not in data:
return jsonify({'error': 'No text provided'}), 400
text_input = data['text']
if not text_input.strip():
return jsonify({'error': 'Text input cannot be empty'}), 400 # Point 2
model = configure_gemini(api_key)
# Use specific prompt with date fallback for raw text
prompt = get_text_prompt_with_fallback_date()
result = call_gemini_with_retry(model, text_input, prompt)
return jsonify({'transactions': result.get('transactions', [])})
except Exception as e:
logging.error(f"Error: {e}")
return jsonify({'error': str(e)}), 500
@app.route('/process-image', methods=['POST'])
def process_image():
"""Handle raw image upload (Receipts, Photos of invoices)."""
temp_path = None
try:
if 'file' not in request.files:
return jsonify({'error': 'No file uploaded'}), 400
file = request.files['file']
# Point 2: Empty check
file.seek(0, os.SEEK_END)
size = file.tell()
file.seek(0)
if size == 0:
return jsonify({'error': 'File is empty'}), 400
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as tmp:
file.save(tmp.name)
temp_path = tmp.name
model = configure_gemini(api_key)
# Load image with PIL
img = Image.open(temp_path)
# Use the General Financial Prompt
result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
return jsonify({'transactions': result.get('transactions', [])})
except Exception as e:
logging.error(f"Error: {e}")
return jsonify({'error': str(e)}), 500
finally:
if temp_path and os.path.exists(temp_path):
os.remove(temp_path)
@app.route('/transaction-types', methods=['GET'])
def get_transaction_types():
"""Return available transaction types and their categories."""
# Kept identical for backwards compatibility
transaction_types = {
"types": [
{
"type": "income",
"description": "Money received from customers, sales, services rendered",
"destination_categories": ["income"]
},
{
"type": "expense",
"description": "Operational costs, purchases, payments made",
"destination_categories": [
"Water and electricity", "Salaries and wages", "Repairs & Maintenance",
"Motor vehicle expenses", "Projects Expenses", "Hardware expenses",
"Refunds", "Accounting fees", "Loan interest", "Bank charges",
"Insurance", "SARS PAYE UIF", "Advertising & Marketing",
"Logistics and distribution", "Fuel", "Website hosting fees",
"Rentals", "Subscriptions", "Computer internet and Telephone",
"Staff training", "Travel and accommodation", "Depreciation",
"Other expenses"
]
},
{
"type": "asset",
"description": "Purchase of equipment, property, vehicles, or other assets",
"destination_categories": [
"Equipment", "Property", "Vehicles", "Technology", "Furniture", "Other assets"
]
},
{
"type": "liability",
"description": "Taking on debt, loans received, credit facilities",
"destination_categories": [
"Bank loan", "Credit facility", "Supplier credit", "Other liabilities"
]
},
{
"type": "equity",
"description": "Owner investments, capital contributions, retained earnings transfers",
"destination_categories": [
"Owner investment", "Retained earnings", "Share capital", "Other equity"
]
},
{
"type": "transfer",
"description": "Money moved between own accounts, internal transfers",
"destination_categories": ["Internal transfer"]
},
{
"type": "investment",
"description": "Securities purchases, investment account funding, portfolio additions",
"destination_categories": [
"Securities", "Mutual funds", "Fixed deposits", "Other investments"
]
},
{
"type": "loan_repayment",
"description": "Paying back borrowed money, loan principal payments",
"destination_categories": ["Loan repayment"]
},
{
"type": "capital_injection",
"description": "Owner or investor adding money to the business",
"destination_categories": ["Capital injection"]
}
]
}
return jsonify(transaction_types)
@app.route('/health', methods=['GET'])
def health_check():
return jsonify({
'status': 'healthy',
'timestamp': datetime.now().isoformat(),
'version': '2.2.0',
'vision_support': PDF_IMAGE_SUPPORT
})
if __name__ == '__main__':
# Ensure this port matches your server configuration
app.run(debug=True, host="0.0.0.0", port=7860)