|
|
import os |
|
|
import json |
|
|
import logging |
|
|
import re |
|
|
import tempfile |
|
|
import time |
|
|
from datetime import datetime |
|
|
from io import BytesIO |
|
|
|
|
|
|
|
|
from flask import Flask, request, jsonify |
|
|
from flask_cors import CORS |
|
|
import pandas as pd |
|
|
import pypdf |
|
|
import google.generativeai as genai |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
try: |
|
|
from pdf2image import convert_from_path |
|
|
PDF_IMAGE_SUPPORT = True |
|
|
except ImportError: |
|
|
PDF_IMAGE_SUPPORT = False |
|
|
logging.warning("pdf2image not installed. Scanned/Encrypted PDF fallback will not work.") |
|
|
|
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
|
|
|
app = Flask(__name__) |
|
|
CORS(app) |
|
|
|
|
|
|
|
|
api_key = os.getenv('Gemini') |
|
|
if not api_key: |
|
|
|
|
|
logging.warning("Gemini API key not found in environment variables.") |
|
|
|
|
|
def configure_gemini(api_key): |
|
|
"""Configure Gemini AI model.""" |
|
|
try: |
|
|
genai.configure(api_key=api_key) |
|
|
|
|
|
return genai.GenerativeModel('gemini-2.0-flash') |
|
|
except Exception as e: |
|
|
logging.error(f"Error configuring Gemini: {str(e)}") |
|
|
raise |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List). |
|
|
Extract all relevant transactions/items in JSON format. |
|
|
|
|
|
RULES: |
|
|
1. **Dates**: Extract the date printed on the document. Format as DD/MM/YYYY. |
|
|
- If the year is missing in the row, use the document's context (e.g., header date). |
|
|
- Do NOT use the current date (today) unless the document explicitly says "Today". |
|
|
2. **Amounts**: Extract the EXACT amount including decimals. DO NOT ROUND. |
|
|
3. **Ignore**: Opening/Closing balances, page numbers, or cumulative running totals. |
|
|
|
|
|
FIELDS TO EXTRACT: |
|
|
- Date: string (DD/MM/YYYY) |
|
|
- Description: string (Full description of item/transaction) |
|
|
- Amount: number (Float, exact value) |
|
|
- Type: string (Categorize exactly as one of: 'income', 'expense', 'asset', 'liability', 'equity', 'transfer', 'investment', 'loan_repayment', 'capital_injection') |
|
|
- Customer_name: string (If 'income', name of payer. If 'expense', name of payee/vendor. Else 'N/A') |
|
|
- City: string (Extract from address if present, else 'N/A') |
|
|
- Document_Type: string (Infer: 'statement', 'invoice', 'receipt', 'transaction_list') |
|
|
- Destination_of_funds: string (Categorize based on description. e.g., 'Salaries', 'Fuel', 'Rentals', 'Equipment', etc.) |
|
|
|
|
|
RETURN STRUCTURE: |
|
|
{ |
|
|
"transactions": [ |
|
|
{ |
|
|
"Date": "DD/MM/YYYY", |
|
|
"Description": "Item Description", |
|
|
"Customer_name": "Vendor or Payer", |
|
|
"City": "City Name", |
|
|
"Amount": 123.45, |
|
|
"Type": "expense", |
|
|
"Destination_of_funds": "Category", |
|
|
"Document_Type": "invoice" |
|
|
} |
|
|
] |
|
|
} |
|
|
|
|
|
Return ONLY raw JSON. No markdown formatting. |
|
|
""" |
|
|
|
|
|
def get_text_prompt_with_fallback_date(): |
|
|
""" |
|
|
Generate prompt for raw text snippets where context might be missing. |
|
|
Only allows current date fallback for raw text, not PDFs. |
|
|
""" |
|
|
current_date = datetime.now().strftime("%d/%m/%Y") |
|
|
return f"""IMPORTANT: Today's date is {current_date}. |
|
|
If the text below does not specify a year or date, reasonable assume {current_date} context, but prefer explicit dates in text. |
|
|
|
|
|
{FINANCIAL_DOC_PROMPT} |
|
|
""" |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def categorize_transaction(transaction): |
|
|
""" |
|
|
Categorizes a transaction based strictly on its Type field. |
|
|
This prevents keyword-based misclassification. |
|
|
|
|
|
Args: |
|
|
transaction: dict with keys including 'Type', 'Description', 'Destination_of_funds' |
|
|
|
|
|
Returns: |
|
|
dict with added 'Account_Category' field |
|
|
""" |
|
|
tx_type = transaction.get('Type', '').lower() |
|
|
description = transaction.get('Description', '').lower() |
|
|
destination = transaction.get('Destination_of_funds', '').lower() |
|
|
|
|
|
|
|
|
account_category = "Uncategorized" |
|
|
|
|
|
|
|
|
if tx_type == 'income': |
|
|
|
|
|
if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']): |
|
|
account_category = "Sales Revenue" |
|
|
elif any(keyword in description for keyword in ['interest', 'dividend']): |
|
|
account_category = "Interest Income" |
|
|
elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']): |
|
|
|
|
|
account_category = "Other Income" |
|
|
else: |
|
|
account_category = "Other Income" |
|
|
|
|
|
|
|
|
elif tx_type == 'expense': |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if 'salaries' in destination or 'wages' in destination or 'salary' in description: |
|
|
account_category = "Salaries and Wages" |
|
|
elif 'water' in destination or 'electricity' in destination: |
|
|
account_category = "Water and Electricity" |
|
|
elif 'fuel' in destination or 'petrol' in description: |
|
|
account_category = "Fuel" |
|
|
elif 'rental' in destination or 'rent' in description: |
|
|
account_category = "Rentals" |
|
|
elif 'marketing' in destination or 'advertising' in destination: |
|
|
account_category = "Advertising & Marketing" |
|
|
elif 'repair' in destination or 'maintenance' in destination: |
|
|
account_category = "Repairs & Maintenance" |
|
|
elif 'vehicle' in destination or 'motor' in destination: |
|
|
account_category = "Motor Vehicle Expenses" |
|
|
elif 'hardware' in destination: |
|
|
account_category = "Hardware Expenses" |
|
|
elif 'accounting' in destination: |
|
|
account_category = "Accounting Fees" |
|
|
elif 'insurance' in destination: |
|
|
account_category = "Insurance" |
|
|
elif 'bank' in destination and 'charge' in destination: |
|
|
account_category = "Bank Charges" |
|
|
elif 'loan' in destination and 'interest' in destination: |
|
|
account_category = "Loan Interest" |
|
|
elif 'subscription' in destination: |
|
|
account_category = "Subscriptions" |
|
|
elif 'internet' in destination or 'telephone' in destination: |
|
|
account_category = "Computer Internet and Telephone" |
|
|
elif 'training' in destination: |
|
|
account_category = "Staff Training" |
|
|
elif 'travel' in destination or 'accommodation' in destination: |
|
|
account_category = "Travel and Accommodation" |
|
|
elif 'depreciation' in destination: |
|
|
account_category = "Depreciation" |
|
|
|
|
|
|
|
|
elif 'atm' in description and 'cash' in description: |
|
|
|
|
|
account_category = "Owner's Drawings" |
|
|
elif 'payment to' in description: |
|
|
|
|
|
if any(word in description for word in ['fabric', 'printing', 'material']): |
|
|
account_category = "Cost of Sales" |
|
|
else: |
|
|
account_category = "Miscellaneous Expense" |
|
|
else: |
|
|
account_category = "Miscellaneous Expense" |
|
|
|
|
|
|
|
|
elif tx_type == 'asset': |
|
|
if 'equipment' in destination or 'equipment' in description: |
|
|
account_category = "Equipment" |
|
|
elif 'vehicle' in destination or 'vehicle' in description: |
|
|
account_category = "Vehicles" |
|
|
elif 'property' in destination or 'property' in description: |
|
|
account_category = "Property" |
|
|
elif 'technology' in destination or 'computer' in description: |
|
|
account_category = "Technology" |
|
|
elif 'furniture' in destination: |
|
|
account_category = "Furniture" |
|
|
else: |
|
|
account_category = "Other Assets" |
|
|
|
|
|
|
|
|
elif tx_type == 'liability': |
|
|
if 'bank loan' in destination or 'loan' in description: |
|
|
account_category = "Bank Loan" |
|
|
elif 'credit' in destination: |
|
|
account_category = "Credit Facility" |
|
|
else: |
|
|
account_category = "Other Liabilities" |
|
|
|
|
|
|
|
|
elif tx_type == 'equity': |
|
|
if 'owner' in destination or 'capital' in description: |
|
|
account_category = "Owner Investment" |
|
|
elif 'retained' in destination: |
|
|
account_category = "Retained Earnings" |
|
|
else: |
|
|
account_category = "Other Equity" |
|
|
|
|
|
|
|
|
elif tx_type == 'transfer': |
|
|
account_category = "Internal Transfer" |
|
|
|
|
|
|
|
|
elif tx_type == 'investment': |
|
|
if 'securities' in destination or 'stock' in description: |
|
|
account_category = "Securities" |
|
|
elif 'mutual' in destination: |
|
|
account_category = "Mutual Funds" |
|
|
else: |
|
|
account_category = "Other Investments" |
|
|
|
|
|
|
|
|
elif tx_type == 'loan_repayment': |
|
|
account_category = "Loan Repayment" |
|
|
|
|
|
|
|
|
elif tx_type == 'capital_injection': |
|
|
account_category = "Capital Injection" |
|
|
|
|
|
|
|
|
transaction['Account_Category'] = account_category |
|
|
return transaction |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_json_from_response(response_text): |
|
|
"""Extract valid JSON from Gemini's response, handling Markdown fences.""" |
|
|
|
|
|
cleaned_text = re.sub(r'```json\s*', '', response_text) |
|
|
cleaned_text = re.sub(r'```\s*', '', cleaned_text) |
|
|
|
|
|
|
|
|
match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL) |
|
|
if match: |
|
|
json_string = match.group(1) |
|
|
else: |
|
|
|
|
|
json_string = cleaned_text |
|
|
|
|
|
try: |
|
|
return json.loads(json_string) |
|
|
except json.JSONDecodeError: |
|
|
logging.warning("JSON parsing failed, attempting repair.") |
|
|
raise ValueError(json_string) |
|
|
|
|
|
def repair_json_with_gemini(model, broken_json_string): |
|
|
"""Uses Gemini to fix broken JSON syntax.""" |
|
|
repair_prompt = f"""Fix this broken JSON string. Return ONLY valid JSON. |
|
|
Broken JSON: {broken_json_string}""" |
|
|
try: |
|
|
resp = model.generate_content(repair_prompt) |
|
|
return extract_json_from_response(resp.text) |
|
|
except Exception as e: |
|
|
logging.error(f"JSON repair failed: {e}") |
|
|
return {"transactions": []} |
|
|
|
|
|
def call_gemini_with_retry(model, content, prompt, retries=2): |
|
|
""" |
|
|
Generic runner for Gemini. |
|
|
Args: |
|
|
content: Can be a String (text) or a PIL.Image object (vision). |
|
|
""" |
|
|
for attempt in range(retries + 1): |
|
|
try: |
|
|
|
|
|
response = model.generate_content([prompt, content]) |
|
|
|
|
|
try: |
|
|
result = extract_json_from_response(response.text) |
|
|
|
|
|
|
|
|
if 'transactions' in result: |
|
|
result['transactions'] = [ |
|
|
categorize_transaction(tx) for tx in result['transactions'] |
|
|
] |
|
|
|
|
|
return result |
|
|
except ValueError as ve: |
|
|
|
|
|
broken_json = str(ve) |
|
|
repaired = repair_json_with_gemini(model, broken_json) |
|
|
|
|
|
|
|
|
if 'transactions' in repaired: |
|
|
repaired['transactions'] = [ |
|
|
categorize_transaction(tx) for tx in repaired['transactions'] |
|
|
] |
|
|
|
|
|
return repaired |
|
|
|
|
|
except Exception as e: |
|
|
if "429" in str(e) or "ResourceExhausted" in str(e): |
|
|
time.sleep(2 * (attempt + 1)) |
|
|
continue |
|
|
logging.error(f"Gemini Error: {e}") |
|
|
if attempt == retries: |
|
|
raise |
|
|
|
|
|
return {"transactions": []} |
|
|
|
|
|
def is_file_empty(file_path): |
|
|
"""Check if file is empty.""" |
|
|
return os.path.getsize(file_path) == 0 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_pdf_page_as_image(model, pdf_path, page_num): |
|
|
"""Point 4: Convert specific PDF page to image and process with Vision.""" |
|
|
if not PDF_IMAGE_SUPPORT: |
|
|
raise ImportError("pdf2image/poppler not installed") |
|
|
|
|
|
|
|
|
|
|
|
images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num) |
|
|
if not images: |
|
|
return [] |
|
|
|
|
|
|
|
|
result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT) |
|
|
return result.get('transactions', []) |
|
|
|
|
|
@app.route('/process-pdf', methods=['POST']) |
|
|
def process_pdf(): |
|
|
""" |
|
|
Smart PDF Processor: |
|
|
1. Checks if empty. |
|
|
2. Tries standard Text extraction (Fast/Cheap). |
|
|
3. If Text fails (Encryption) or is empty (Scanned), falls back to Vision (Slow/Powerful). |
|
|
""" |
|
|
temp_path = None |
|
|
try: |
|
|
|
|
|
if 'file' not in request.files: |
|
|
return jsonify({'error': 'No file uploaded'}), 400 |
|
|
file = request.files['file'] |
|
|
if file.filename == '': |
|
|
return jsonify({'error': 'No file selected'}), 400 |
|
|
|
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp: |
|
|
file.save(tmp.name) |
|
|
temp_path = tmp.name |
|
|
|
|
|
|
|
|
if is_file_empty(temp_path): |
|
|
return jsonify({'error': 'Uploaded file is empty'}), 400 |
|
|
|
|
|
model = configure_gemini(api_key) |
|
|
all_transactions = [] |
|
|
|
|
|
|
|
|
try: |
|
|
reader = pypdf.PdfReader(temp_path) |
|
|
num_pages = len(reader.pages) |
|
|
|
|
|
for i in range(num_pages): |
|
|
logging.info(f"Processing page {i+1}/{num_pages}") |
|
|
|
|
|
|
|
|
try: |
|
|
text_content = reader.pages[i].extract_text() |
|
|
except Exception: |
|
|
text_content = "" |
|
|
|
|
|
|
|
|
if text_content and len(text_content.strip()) > 50: |
|
|
|
|
|
logging.info("Text detected. Using Text Strategy.") |
|
|
result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT) |
|
|
else: |
|
|
|
|
|
logging.info("Low text/Encryption detected. Switching to Vision Strategy.") |
|
|
if PDF_IMAGE_SUPPORT: |
|
|
|
|
|
|
|
|
txs = process_pdf_page_as_image(model, temp_path, i+1) |
|
|
all_transactions.extend(txs) |
|
|
continue |
|
|
else: |
|
|
logging.warning("Cannot process scanned PDF - pdf2image missing.") |
|
|
result = {"transactions": []} |
|
|
|
|
|
txs = result.get('transactions', []) |
|
|
all_transactions.extend(txs) |
|
|
|
|
|
except pypdf.errors.PdfReadError: |
|
|
|
|
|
logging.warning("pypdf failed to read file. Attempting full Vision fallback.") |
|
|
if PDF_IMAGE_SUPPORT: |
|
|
|
|
|
images = convert_from_path(temp_path) |
|
|
for img in images: |
|
|
result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT) |
|
|
all_transactions.extend(result.get('transactions', [])) |
|
|
else: |
|
|
raise ValueError("PDF is unreadable and Vision fallback is unavailable.") |
|
|
|
|
|
return jsonify({'transactions': all_transactions}) |
|
|
|
|
|
except Exception as e: |
|
|
logging.error(f"Server Error: {e}") |
|
|
return jsonify({'error': str(e)}), 500 |
|
|
finally: |
|
|
if temp_path and os.path.exists(temp_path): |
|
|
os.remove(temp_path) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.route('/process-text', methods=['POST']) |
|
|
def process_text(): |
|
|
"""Handle raw text input.""" |
|
|
try: |
|
|
data = request.get_json() |
|
|
if not data or 'text' not in data: |
|
|
return jsonify({'error': 'No text provided'}), 400 |
|
|
|
|
|
text_input = data['text'] |
|
|
if not text_input.strip(): |
|
|
return jsonify({'error': 'Text input cannot be empty'}), 400 |
|
|
|
|
|
model = configure_gemini(api_key) |
|
|
|
|
|
prompt = get_text_prompt_with_fallback_date() |
|
|
|
|
|
result = call_gemini_with_retry(model, text_input, prompt) |
|
|
return jsonify({'transactions': result.get('transactions', [])}) |
|
|
|
|
|
except Exception as e: |
|
|
logging.error(f"Error: {e}") |
|
|
return jsonify({'error': str(e)}), 500 |
|
|
|
|
|
@app.route('/process-image', methods=['POST']) |
|
|
def process_image(): |
|
|
"""Handle raw image upload (Receipts, Photos of invoices).""" |
|
|
temp_path = None |
|
|
try: |
|
|
if 'file' not in request.files: |
|
|
return jsonify({'error': 'No file uploaded'}), 400 |
|
|
file = request.files['file'] |
|
|
|
|
|
|
|
|
file.seek(0, os.SEEK_END) |
|
|
size = file.tell() |
|
|
file.seek(0) |
|
|
if size == 0: |
|
|
return jsonify({'error': 'File is empty'}), 400 |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.filename)[1]) as tmp: |
|
|
file.save(tmp.name) |
|
|
temp_path = tmp.name |
|
|
|
|
|
model = configure_gemini(api_key) |
|
|
|
|
|
|
|
|
img = Image.open(temp_path) |
|
|
|
|
|
|
|
|
result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT) |
|
|
|
|
|
return jsonify({'transactions': result.get('transactions', [])}) |
|
|
|
|
|
except Exception as e: |
|
|
logging.error(f"Error: {e}") |
|
|
return jsonify({'error': str(e)}), 500 |
|
|
finally: |
|
|
if temp_path and os.path.exists(temp_path): |
|
|
os.remove(temp_path) |
|
|
|
|
|
@app.route('/transaction-types', methods=['GET']) |
|
|
def get_transaction_types(): |
|
|
"""Return available transaction types and their categories.""" |
|
|
|
|
|
transaction_types = { |
|
|
"types": [ |
|
|
{ |
|
|
"type": "income", |
|
|
"description": "Money received from customers, sales, services rendered", |
|
|
"destination_categories": ["income"] |
|
|
}, |
|
|
{ |
|
|
"type": "expense", |
|
|
"description": "Operational costs, purchases, payments made", |
|
|
"destination_categories": [ |
|
|
"Water and electricity", "Salaries and wages", "Repairs & Maintenance", |
|
|
"Motor vehicle expenses", "Projects Expenses", "Hardware expenses", |
|
|
"Refunds", "Accounting fees", "Loan interest", "Bank charges", |
|
|
"Insurance", "SARS PAYE UIF", "Advertising & Marketing", |
|
|
"Logistics and distribution", "Fuel", "Website hosting fees", |
|
|
"Rentals", "Subscriptions", "Computer internet and Telephone", |
|
|
"Staff training", "Travel and accommodation", "Depreciation", |
|
|
"Other expenses" |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"type": "asset", |
|
|
"description": "Purchase of equipment, property, vehicles, or other assets", |
|
|
"destination_categories": [ |
|
|
"Equipment", "Property", "Vehicles", "Technology", "Furniture", "Other assets" |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"type": "liability", |
|
|
"description": "Taking on debt, loans received, credit facilities", |
|
|
"destination_categories": [ |
|
|
"Bank loan", "Credit facility", "Supplier credit", "Other liabilities" |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"type": "equity", |
|
|
"description": "Owner investments, capital contributions, retained earnings transfers", |
|
|
"destination_categories": [ |
|
|
"Owner investment", "Retained earnings", "Share capital", "Other equity" |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"type": "transfer", |
|
|
"description": "Money moved between own accounts, internal transfers", |
|
|
"destination_categories": ["Internal transfer"] |
|
|
}, |
|
|
{ |
|
|
"type": "investment", |
|
|
"description": "Securities purchases, investment account funding, portfolio additions", |
|
|
"destination_categories": [ |
|
|
"Securities", "Mutual funds", "Fixed deposits", "Other investments" |
|
|
] |
|
|
}, |
|
|
{ |
|
|
"type": "loan_repayment", |
|
|
"description": "Paying back borrowed money, loan principal payments", |
|
|
"destination_categories": ["Loan repayment"] |
|
|
}, |
|
|
{ |
|
|
"type": "capital_injection", |
|
|
"description": "Owner or investor adding money to the business", |
|
|
"destination_categories": ["Capital injection"] |
|
|
} |
|
|
] |
|
|
} |
|
|
return jsonify(transaction_types) |
|
|
|
|
|
@app.route('/health', methods=['GET']) |
|
|
def health_check(): |
|
|
return jsonify({ |
|
|
'status': 'healthy', |
|
|
'timestamp': datetime.now().isoformat(), |
|
|
'version': '2.2.0', |
|
|
'vision_support': PDF_IMAGE_SUPPORT |
|
|
}) |
|
|
|
|
|
if __name__ == '__main__': |
|
|
|
|
|
app.run(debug=True, host="0.0.0.0", port=7860) |