tunasonga-api

Sleeping

App Files Files Community

rairo committed on May 2, 2025

Commit

ac4c899

verified ·

1 Parent(s): 4c92169

Update main.py

Browse files

Files changed (1) hide show

main.py +244 -10

main.py CHANGED Viewed

@@ -1,16 +1,20 @@
 import os
 import json
-import faiss
 import numpy as np
 import pickle
 from flask import Flask, request, jsonify
 from flask_cors import CORS
-import firebase_admin
-from firebase_admin import credentials, firestore
 from dotenv import load_dotenv
 from google import genai
-from google.genai import types
 load_dotenv()
@@ -18,19 +22,26 @@ load_dotenv()
 app = Flask(__name__)
 CORS(app)
 cred_json = os.environ.get("FIREBASE")
-if cred_json:
-    cred = credentials.Certificate(json.loads(cred_json))
-    firebase_admin.initialize_app(cred)
-fs = firestore.client()
 # --------- Google GenAI Client ---------
 client = genai.Client(api_key=os.getenv("Gemini"))
-# --------- FAISS Cache Paths ---------
 INDEX_PATH = "vector.index"
 DOCS_PATH  = "documents.pkl"
 # --------- Fetch & Summarize Firestore Docs ---------
 def fetch_documents() -> list[str]:
     docs: list[str] = []
@@ -142,7 +153,82 @@ def retrieve_and_respond(user_query: str, top_k: int = 3) -> str:
     resp = chat.send_message(prompt)
     return resp.text
-# --------- Flask Endpoint ---------
 @app.route("/chat", methods=["POST"])
 def chat_endpoint():
     data = request.get_json(force=True)
@@ -154,5 +240,153 @@ def chat_endpoint():
     except Exception as e:
         return jsonify({"error": str(e)}), 500
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)

 import os
 import json
+import time
+from datetime import datetime
+from io import BytesIO
+import pypdf
+import firebase_admin
 import numpy as np
+import faiss
 import pickle
 from flask import Flask, request, jsonify
 from flask_cors import CORS
 from dotenv import load_dotenv
+from firebase_admin import credentials, firestore, storage
 from google import genai
 load_dotenv()
 app = Flask(__name__)
 CORS(app)
+# Initialize Firebase with Firestore + Storage
 cred_json = os.environ.get("FIREBASE")
+if not cred_json:
+    raise RuntimeError("Missing FIREBASE env var")
+cred = credentials.Certificate(json.loads(cred_json))
+firebase_admin.initialize_app(cred, {
+    "storageBucket": os.environ.get("FIREBASE_STORAGE_BUCKET")
+})
+fs      = firestore.client()
+bucket  = storage.bucket()
 # --------- Google GenAI Client ---------
 client = genai.Client(api_key=os.getenv("Gemini"))
+# --------- FAISS Cache Paths (unchanged) ---------
 INDEX_PATH = "vector.index"
 DOCS_PATH  = "documents.pkl"
 # --------- Fetch & Summarize Firestore Docs ---------
 def fetch_documents() -> list[str]:
     docs: list[str] = []
     resp = chat.send_message(prompt)
     return resp.text
+# --------- Helpers for Bank-Statement Processing ---------
def read_pdf_pages(file_obj):
    """Open a PDF from a seekable file object.

    The stream is rewound to offset 0 first so that a handle which has
    already been read (for example, uploaded elsewhere) parses from the start.

    Args:
        file_obj: Seekable binary file-like object containing the PDF.

    Returns:
        A ``(reader, page_count)`` tuple: the ``pypdf.PdfReader`` built on
        ``file_obj`` and the number of pages it reports.
    """
    file_obj.seek(0)
    pdf = pypdf.PdfReader(file_obj)
    page_count = len(pdf.pages)
    return pdf, page_count
def extract_page_text(reader, page_num):
    """Return the text of one page, or ``""`` when there is nothing to return.

    Args:
        reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()`` (as built by ``read_pdf_pages``).
        page_num: Zero-based page index.

    Returns:
        The extracted text. An empty string is returned when ``page_num`` is
        outside ``0 .. len(reader.pages) - 1`` or when the page yields no text.
    """
    # Check both bounds: a negative index would otherwise wrap around and
    # silently return a page counted from the end of the document.
    if not 0 <= page_num < len(reader.pages):
        return ""
    return reader.pages[page_num].extract_text() or ""
def process_with_gemini(model, text, *, delay=6.0, max_retries=1):
    """Ask the model to extract bank-statement transactions from ``text``.

    Args:
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute.
        text: Raw text of one statement page.
        delay: Seconds to pause after every model call (rate-limit pacing)
            and before each retry. Defaults to the original 6 seconds.
        max_retries: How many times a call that failed with HTTP 504 is
            retried. Defaults to the original single retry.

    Returns:
        The ``text`` attribute of the model response (expected to contain the
        JSON document described in the prompt).

    Raises:
        Exception: Whatever ``generate_content`` raised, when the failure is
            not a 504 or the retry budget is exhausted.
    """
    # NOTE(review): the prompt names the field "Customer Name" while the JSON
    # schema below uses "Customer_name", and it mentions "Destination_of_funds",
    # which is not part of the schema. Left untouched here because changing the
    # prompt changes model output; confirm the intended wording.
    prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
    - Date (format DD/MM/YYYY)
    - Description
    - Amount (just the integer value)
    - Type (is 'income' if 'credit amount', else 'expense')
    - Customer Name (Only If Type is 'income' and if no name is extracted write 'general income' and if type is not 'income' write 'expense')
    - City (In address of bank statement)
    - Category_of_expense (a string, if transaction 'Type' is 'expense' categorize it based on description into: Water and electricity, Salaries and wages, Repairs & Maintenance, Motor vehicle expenses, Projects Expenses, Hardware expenses, Refunds, Accounting fees, Loan interest, Bank charges, Insurance, SARS PAYE UIF, Advertising & Marketing, Logistics and distribution, Fuel, Website hosting fees, Rentals, Subscriptions, Computer internet and Telephone, Staff training, Travel and accommodation, Depreciation, Other expenses. If no category matches, default to 'Other expenses'. If 'Type' is 'income' set Destination_of_funds to 'income'.)
    - ignore opening or closing balances, charts and analysis.
    Return ONLY valid JSON with this structure:
    {
        "transactions": [
            {
                "Date": "string",
                "Description": "string",
                "Customer_name": "string",
                "City": "string",
                "Amount": number,
                "Type": "string",
                "Category_of_expense": "string"
            }
        ]
    }"""
    retries_left = max_retries
    while True:
        try:
            resp = model.generate_content([prompt, text])
        except Exception as e:
            status = getattr(getattr(e, "response", None), "status_code", None)
            # Only gateway timeouts are treated as transient.
            if status != 504 or retries_left <= 0:
                raise
            retries_left -= 1
            time.sleep(delay)
            continue
        # Pace every successful call, including one that succeeded on a retry,
        # so the next page's request does not immediately follow this one.
        time.sleep(delay)
        return resp.text
def _parse_transactions(raw):
    """Pull the list of transaction dicts out of one raw model reply."""
    # The reply may be missing entirely; treat that as "no transactions".
    if not isinstance(raw, str):
        return []
    # Keep only the outermost {...} span so surrounding prose is ignored.
    start = raw.find("{")
    end = raw.rfind("}") + 1
    if start < 0 or end <= start:
        return []
    js = raw[start:end].replace("```json", "").replace("```", "")
    try:
        data = json.loads(js)
    except json.JSONDecodeError:
        return []
    txns = data.get("transactions") if isinstance(data, dict) else None
    # "transactions": null (or any non-list) must not abort the whole upload.
    if not isinstance(txns, list):
        return []
    return [t for t in txns if isinstance(t, dict)]


def process_pdf_pages(model, pdf_file):
    """Extract transactions from every page of a PDF bank statement.

    Each page with non-blank text is sent to ``process_with_gemini``; the JSON
    object found in the reply is parsed and its ``"transactions"`` entries are
    collected. Processing is best-effort: a page whose model call fails or
    whose reply cannot be parsed is skipped rather than failing the document.

    Args:
        model: Passed through to ``process_with_gemini``.
        pdf_file: Seekable binary file-like object containing the PDF.

    Returns:
        A list of transaction dicts gathered across all pages (possibly empty).
    """
    # Local import: logging is not imported at the top of the file.
    import logging

    log = logging.getLogger(__name__)

    reader, total_pages = read_pdf_pages(pdf_file)
    all_txns = []
    for pg in range(total_pages):
        txt = extract_page_text(reader, pg).strip()
        if not txt:
            continue
        try:
            raw = process_with_gemini(model, txt)
        except Exception:
            # Still best-effort, but leave a trace so lost pages are visible.
            log.warning(
                "Transaction extraction failed for page %d; skipping",
                pg + 1,
                exc_info=True,
            )
            continue
        all_txns.extend(_parse_transactions(raw))
    return all_txns
+# --------- Chat Endpoint ---------
 @app.route("/chat", methods=["POST"])
 def chat_endpoint():
     data = request.get_json(force=True)
     except Exception as e:
         return jsonify({"error": str(e)}), 500
+# --------- Endpoint: Upload & Store Bank Statements ---------
class _GenAIModelAdapter:
    """Give a google-genai ``Client`` the ``generate_content(parts)`` shape."""

    def __init__(self, genai_client, model_name):
        self._client = genai_client
        self._model_name = model_name

    def generate_content(self, parts):
        return self._client.models.generate_content(
            model=self._model_name, contents=parts
        )


@app.route("/upload_statements", methods=["POST"])
def upload_statements():
    """
    Expects multipart/form-data:
      - 'business_id': string
      - 'files': one or more PDFs
    Stores each PDF in Storage, extracts transactions, and writes them
    to Firestore (collection 'transactions') with a 'business_id' tag.

    Returns a JSON message with the number of stored transactions (200), or a
    JSON error with status 400 when 'business_id' or 'files' is missing.
    """
    # Local import: the file only imports the `datetime` class at module level.
    from datetime import timezone

    business_id = request.form.get("business_id", "").strip()
    if not business_id:
        return jsonify({"error": "Missing business_id"}), 400
    if "files" not in request.files:
        return jsonify({"error": "No files part; upload under key 'files'"}), 400
    files = request.files.getlist("files")
    if not files:
        return jsonify({"error": "No files uploaded"}), 400

    # `from google import genai` is the google-genai SDK, which has no
    # `GenerativeModel` class; wrap the module-level client instead so that
    # process_with_gemini can keep calling `model.generate_content(...)`.
    model = _GenAIModelAdapter(client, "gemini-2.0-flash-thinking-exp")

    transactions = fs.collection("transactions")
    stored_count = 0
    for f in files:
        # Keep only the final path component of the client-supplied name so it
        # cannot add extra segments to the storage object path.
        raw_name = (f.filename or "").replace("\\", "/")
        filename = os.path.basename(raw_name) or "statement.pdf"

        # upload raw PDF to storage
        stamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S.%f")
        dest_path = f"{business_id}/bank_statements/{stamp}_{filename}"
        blob = bucket.blob(dest_path)
        f.seek(0)
        blob.upload_from_file(f, content_type=f.content_type)

        # rewind for processing
        f.seek(0)

        # extract + store transactions
        for txn in process_pdf_pages(model, f):
            if not isinstance(txn, dict):
                continue
            try:
                dt = datetime.strptime(txn["Date"], "%d/%m/%Y").replace(
                    tzinfo=timezone.utc
                )
            except (KeyError, TypeError, ValueError):
                # Missing or unparseable date: fall back to the upload time.
                dt = datetime.now(timezone.utc)
            record = {
                "business_id": business_id,
                # The Python Firestore client stores datetime values as
                # timestamps directly; there is no `firestore.Timestamp`.
                "Date": dt,
                "Description": txn.get("Description", ""),
                "Amount": txn.get("Amount", 0),
                "Type": txn.get("Type", "expense"),
                "Customer_name": txn.get(
                    "Customer_name",
                    "general income" if txn.get("Type") == "income" else "expense",
                ),
                "City": txn.get("City", ""),
                "Category_of_expense": txn.get("Category_of_expense", ""),
            }
            transactions.add(record)
            stored_count += 1

    return jsonify({"message": f"Stored {stored_count} transactions"}), 200
+# --------- Endpoint: Retrieve or Generate Financial Statement ---------
@app.route("/financial_statement", methods=["POST"])
def financial_statement():
    """
    Expects JSON:
      {
        "business_id": "...",
        "start_date":  "YYYY-MM-DD",
        "end_date":    "YYYY-MM-DD",
        "statement_type": "Income Statement"|"Cashflow Statement"|"Balance Sheet"
      }
    If a cached report exists for that exact (business_id, start, end,
    statement_type), returns it.
    Otherwise generates via Gemini, returns it, and caches it in Firestore.

    Responses:
      200 {"report": ..., "cached": bool}
      400 on a malformed body, missing fields, bad dates or inverted range
      404 when no transactions fall in the period
      502 when the model returns an empty report
    """
    # Local import: the file only imports the `datetime` class at module level.
    from datetime import timezone

    data = request.get_json(force=True) or {}
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    # Coerce to str so a non-string JSON value yields a 400 rather than a 500.
    biz = str(data.get("business_id") or "").strip()
    sd = str(data.get("start_date") or "")
    ed = str(data.get("end_date") or "")
    stype = str(data.get("statement_type") or "Income Statement")
    if not (biz and sd and ed):
        return jsonify({"error": "Missing one of business_id, start_date, end_date"}), 400

    # parse iso dates
    try:
        dt_start = datetime.fromisoformat(sd)
        dt_end = datetime.fromisoformat(ed)
    except (TypeError, ValueError):
        return jsonify({"error": "Dates must be YYYY-MM-DD"}), 400
    # Compare and query with timezone-aware values; naive input is read as UTC.
    if dt_start.tzinfo is None:
        dt_start = dt_start.replace(tzinfo=timezone.utc)
    if dt_end.tzinfo is None:
        dt_end = dt_end.replace(tzinfo=timezone.utc)
    if dt_end < dt_start:
        return jsonify({"error": "end_date must not be before start_date"}), 400

    # check cache ("/" would split the id into extra document-path segments)
    doc_id = f"{biz}__{sd}__{ed}__{stype.replace(' ', '_')}".replace("/", "_")
    doc_ref = fs.collection("financial_statements").document(doc_id)
    cached = doc_ref.get()
    if cached.exists:
        # NOTE(review): cached reports are never invalidated when further
        # statements are uploaded for the same period.
        return jsonify({"report": cached.to_dict()["report"], "cached": True}), 200

    # fetch transactions; datetimes are passed directly because the Python
    # Firestore client has no `firestore.Timestamp` wrapper.
    snaps = (
        fs.collection("transactions")
          .where("business_id", "==", biz)
          .where("Date", ">=", dt_start)
          .where("Date", "<=", dt_end)
          .stream()
    )
    txns = []
    for s in snaps:
        d = s.to_dict()
        txns.append({
            # Assumes "Date" is read back as a datetime (as upload stores it),
            # so strftime applies directly - TODO confirm against stored data.
            "Date": d["Date"].strftime("%d/%m/%Y"),
            "Description": d.get("Description", ""),
            "Amount": d.get("Amount", 0),
            "Type": d.get("Type", ""),
            "Customer_name": d.get("Customer_name", ""),
            "City": d.get("City", ""),
            "Category_of_expense": d.get("Category_of_expense", ""),
        })
    if not txns:
        return jsonify({"error": "No transactions found for that period"}), 404

    # generate with Gemini
    prompt = (
        f"Based on the following transactions JSON data:\n"
        f"{json.dumps({'transactions': txns})}\n"
        f"Generate a detailed {stype} for the period from "
        f"{dt_start.strftime('%d/%m/%Y')} to {dt_end.strftime('%d/%m/%Y')} "
        f"in Markdown, following standard South African accounting practice, with headings, "
        "subtotals, totals, key highlights, and a concise summary."
    )
    chat = client.chats.create(model="gemini-2.5-pro-exp-03-25")
    resp = chat.send_message(prompt)
    time.sleep(7)  # pause kept from the original, presumably rate-limit pacing
    report = resp.text
    if not report:
        # Do not cache an empty result; it would be served forever afterwards.
        return jsonify({"error": "Model returned an empty report"}), 502

    # cache it
    doc_ref.set({
        "business_id": biz,
        "start_date": sd,
        "end_date": ed,
        "statement_type": stype,
        "report": report,
        "created_at": firestore.SERVER_TIMESTAMP,
    })
    return jsonify({"report": report, "cached": False}), 200
+# --------- Run the App ---------
 if __name__ == "__main__":
     app.run(host="0.0.0.0", port=7860, debug=True)