Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -1,16 +1,20 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
import pickle
|
| 6 |
from flask import Flask, request, jsonify
|
| 7 |
from flask_cors import CORS
|
| 8 |
-
import firebase_admin
|
| 9 |
-
from firebase_admin import credentials, firestore
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
|
|
|
|
| 12 |
from google import genai
|
| 13 |
-
from google.genai import types
|
| 14 |
|
| 15 |
load_dotenv()
|
| 16 |
|
|
@@ -18,19 +22,26 @@ load_dotenv()
|
|
| 18 |
app = Flask(__name__)
|
| 19 |
CORS(app)
|
| 20 |
|
|
|
|
| 21 |
cred_json = os.environ.get("FIREBASE")
|
| 22 |
-
if cred_json:
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
# --------- Google GenAI Client ---------
|
| 28 |
client = genai.Client(api_key=os.getenv("Gemini"))
|
| 29 |
|
| 30 |
-
# --------- FAISS Cache Paths ---------
|
| 31 |
INDEX_PATH = "vector.index"
|
| 32 |
DOCS_PATH = "documents.pkl"
|
| 33 |
|
|
|
|
|
|
|
| 34 |
# --------- Fetch & Summarize Firestore Docs ---------
|
| 35 |
def fetch_documents() -> list[str]:
|
| 36 |
docs: list[str] = []
|
|
@@ -142,7 +153,82 @@ def retrieve_and_respond(user_query: str, top_k: int = 3) -> str:
|
|
| 142 |
resp = chat.send_message(prompt)
|
| 143 |
return resp.text
|
| 144 |
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
@app.route("/chat", methods=["POST"])
|
| 147 |
def chat_endpoint():
|
| 148 |
data = request.get_json(force=True)
|
|
@@ -154,5 +240,153 @@ def chat_endpoint():
|
|
| 154 |
except Exception as e:
|
| 155 |
return jsonify({"error": str(e)}), 500
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
if __name__ == "__main__":
|
| 158 |
app.run(host="0.0.0.0", port=7860, debug=True)
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
+
import time
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
from io import BytesIO
|
| 6 |
+
|
| 7 |
+
import pypdf
|
| 8 |
+
import firebase_admin
|
| 9 |
import numpy as np
|
| 10 |
+
import faiss
|
| 11 |
import pickle
|
| 12 |
from flask import Flask, request, jsonify
|
| 13 |
from flask_cors import CORS
|
|
|
|
|
|
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
|
| 16 |
+
from firebase_admin import credentials, firestore, storage
|
| 17 |
from google import genai
|
|
|
|
| 18 |
|
| 19 |
load_dotenv()
|
| 20 |
|
|
|
|
| 22 |
app = Flask(__name__)
CORS(app)

# --------- Firebase (Firestore + Storage) ---------
# The service-account key is read as a JSON string from the FIREBASE
# environment variable; start-up is aborted when it is absent.
cred_json = os.environ.get("FIREBASE")
if not cred_json:
    raise RuntimeError("Missing FIREBASE env var")
cred = credentials.Certificate(json.loads(cred_json))
firebase_admin.initialize_app(
    cred,
    {"storageBucket": os.environ.get("FIREBASE_STORAGE_BUCKET")},
)
fs = firestore.client()    # Firestore handle used by the endpoints in this file
bucket = storage.bucket()  # storage bucket handle used for uploaded statements

# --------- Google GenAI Client ---------
# The API key is read from the environment variable literally named "Gemini".
client = genai.Client(api_key=os.getenv("Gemini"))

# --------- FAISS Cache Paths ---------
INDEX_PATH = "vector.index"
DOCS_PATH = "documents.pkl"
|
| 42 |
|
| 43 |
+
|
| 44 |
+
|
| 45 |
# --------- Fetch & Summarize Firestore Docs ---------
|
| 46 |
def fetch_documents() -> list[str]:
|
| 47 |
docs: list[str] = []
|
|
|
|
| 153 |
resp = chat.send_message(prompt)
|
| 154 |
return resp.text
|
| 155 |
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
# --------- Helpers for Bank-Statement Processing ---------
|
| 159 |
+
|
| 160 |
+
def read_pdf_pages(file_obj):
    """Open a PDF from a seekable file object.

    The stream is rewound to its start before parsing, so it may already
    have been read by the caller.

    Returns:
        A ``(reader, page_count)`` tuple: the ``pypdf.PdfReader`` and the
        number of pages it reports.
    """
    file_obj.seek(0)
    pdf = pypdf.PdfReader(file_obj)
    page_count = len(pdf.pages)
    return pdf, page_count
|
| 164 |
+
|
| 165 |
+
def extract_page_text(reader, page_num):
    """Return the extracted text of one page, or ``""`` when unavailable.

    Args:
        reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()`` (e.g. a ``pypdf.PdfReader``).
        page_num: Zero-based page index.

    Returns:
        The page text, or an empty string when ``page_num`` is out of range
        or the page yields no text (``extract_text()`` returned a falsy value).
    """
    # A negative index would silently wrap around and return a page counted
    # from the end, so it is treated as out of range like any index past the
    # last page.
    if not 0 <= page_num < len(reader.pages):
        return ""
    return reader.pages[page_num].extract_text() or ""
|
| 169 |
+
|
| 170 |
+
def process_with_gemini(model, text):
    """Ask the model to extract transactions from one page of statement text.

    Args:
        model: Object exposing ``generate_content(parts)`` whose result has a
            ``text`` attribute.
        text: Text of the bank-statement page.

    Returns:
        The raw ``text`` of the model response (expected to contain JSON).

    Raises:
        Exception: Whatever the model call raises, except that a failure whose
            ``response.status_code`` is 504 is retried exactly once.
    """
    # NOTE(review): the field list says "Customer Name" and "Destination_of_funds"
    # while the JSON template uses "Customer_name" / "Category_of_expense" --
    # confirm which names the model is meant to emit.
    prompt = """Analyze this bank statement and extract transactions in JSON format with these fields:
- Date (format DD/MM/YYYY)
- Description
- Amount (just the integer value)
- Type (is 'income' if 'credit amount', else 'expense')
- Customer Name (Only If Type is 'income' and if no name is extracted write 'general income' and if type is not 'income' write 'expense')
- City (In address of bank statement)
- Category_of_expense (a string, if transaction 'Type' is 'expense' categorize it based on description into: Water and electricity, Salaries and wages, Repairs & Maintenance, Motor vehicle expenses, Projects Expenses, Hardware expenses, Refunds, Accounting fees, Loan interest, Bank charges, Insurance, SARS PAYE UIF, Advertising & Marketing, Logistics and distribution, Fuel, Website hosting fees, Rentals, Subscriptions, Computer internet and Telephone, Staff training, Travel and accommodation, Depreciation, Other expenses. If no category matches, default to 'Other expenses'. If 'Type' is 'income' set Destination_of_funds to 'income'.)
- ignore opening or closing balances, charts and analysis.

Return ONLY valid JSON with this structure:
{
"transactions": [
{
"Date": "string",
"Description": "string",
"Customer_name": "string",
"City": "string",
"Amount": number,
"Type": "string",
"Category_of_expense": "string"
}
]
}"""
    try:
        first_reply = model.generate_content([prompt, text])
        # Fixed pause after every successful call (rate-limit workaround).
        time.sleep(6)
        return first_reply.text
    except Exception as err:
        status = getattr(getattr(err, "response", None), "status_code", None)
        if status != 504:
            raise
        # Gateway timeout: wait, then try once more; a second failure propagates.
        time.sleep(6)
        return model.generate_content([prompt, text]).text
|
| 206 |
+
|
| 207 |
+
def process_pdf_pages(model, pdf_file):
    """Extract transactions from every page of a PDF bank statement.

    Each page with non-blank text is sent to ``process_with_gemini``; the JSON
    object found in the reply is parsed and its ``"transactions"`` entries are
    collected. Processing is best effort: a page that fails (model error,
    empty reply, invalid JSON, unexpected structure) is skipped and logged,
    and the remaining pages are still processed.

    Args:
        model: Model object forwarded to ``process_with_gemini``.
        pdf_file: Seekable file object containing the PDF.

    Returns:
        A list of transaction dicts, in page order.
    """
    import logging

    log = logging.getLogger(__name__)

    reader, total_pages = read_pdf_pages(pdf_file)
    all_txns = []
    for pg in range(total_pages):
        txt = extract_page_text(reader, pg).strip()
        if not txt:
            continue

        try:
            raw = process_with_gemini(model, txt)
        except Exception:
            # One failing page must not abort the whole statement, but it
            # should not disappear without a trace either.
            log.exception("Transaction extraction failed for page %d", pg + 1)
            continue

        if not raw:
            # The response text can be None or empty; there is nothing to parse.
            log.warning("Empty model response for page %d", pg + 1)
            continue

        # Keep only the outermost {...} span; this already drops any Markdown
        # code fences wrapped around the JSON.
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start < 0 or end <= start:
            continue

        try:
            data = json.loads(raw[start:end])
        except json.JSONDecodeError:
            log.warning("Model response for page %d is not valid JSON", pg + 1)
            continue

        txns = data.get("transactions") if isinstance(data, dict) else None
        if isinstance(txns, list):
            # Callers read each transaction as a dict; drop anything else.
            all_txns.extend(t for t in txns if isinstance(t, dict))
    return all_txns
|
| 230 |
+
|
| 231 |
+
# --------- Chat Endpoint ---------
|
| 232 |
@app.route("/chat", methods=["POST"])
|
| 233 |
def chat_endpoint():
|
| 234 |
data = request.get_json(force=True)
|
|
|
|
| 240 |
except Exception as e:
|
| 241 |
return jsonify({"error": str(e)}), 500
|
| 242 |
|
| 243 |
+
# --------- Endpoint: Upload & Store Bank Statements ---------
|
| 244 |
+
|
| 245 |
+
class _GeminiTextModel:
    """Adapter exposing ``generate_content(parts)`` on the shared GenAI client.

    The SDK imported by this file (``from google import genai``) is used through
    ``genai.Client``; it does not provide ``genai.GenerativeModel``, so text
    generation is routed through ``client.models.generate_content``.
    """

    def __init__(self, model_name):
        self._model_name = model_name

    def generate_content(self, parts):
        """Send ``parts`` to the configured model and return the SDK response."""
        return client.models.generate_content(model=self._model_name, contents=parts)


@app.route("/upload_statements", methods=["POST"])
def upload_statements():
    """
    Expects multipart/form-data:
      - 'business_id': string
      - 'files': one or more PDFs
    Stores each PDF in Storage, extracts transactions, and writes them
    to Firestore (collection 'transactions') with a 'business_id' tag.

    Returns a JSON message with the number of stored transactions, or a
    400 error when 'business_id' or 'files' is missing.
    """
    business_id = request.form.get("business_id", "").strip()
    if not business_id:
        return jsonify({"error": "Missing business_id"}), 400

    if "files" not in request.files:
        return jsonify({"error": "No files part; upload under key 'files'"}), 400

    files = request.files.getlist("files")
    if not files:
        return jsonify({"error": "No files uploaded"}), 400

    model = _GeminiTextModel("gemini-2.0-flash-thinking-exp")
    stored_count = 0

    for f in files:
        # Keep only the last path component of the client-supplied name so it
        # cannot introduce extra segments into the storage object path.
        filename = os.path.basename((f.filename or "").replace("\\", "/")) or "statement.pdf"

        # Upload the raw PDF to storage under a timestamped name.
        dest_path = f"{business_id}/bank_statements/{datetime.utcnow().isoformat()}_{filename}"
        blob = bucket.blob(dest_path)
        f.seek(0)
        blob.upload_from_file(f, content_type=f.content_type)
        # Rewind so the same stream can be parsed below.
        f.seek(0)

        # Extract and store transactions.
        for txn in process_pdf_pages(model, f):
            if not isinstance(txn, dict):
                continue
            try:
                dt = datetime.strptime(txn["Date"], "%d/%m/%Y")
            except (KeyError, TypeError, ValueError):
                # Missing or unparseable date: fall back to the upload time.
                dt = datetime.utcnow()
            default_customer = "general income" if txn.get("Type") == "income" else "expense"
            record = {
                "business_id": business_id,
                # The Python Firestore client stores datetime values directly
                # as timestamps; there is no firestore.Timestamp wrapper.
                "Date": dt,
                "Description": txn.get("Description", ""),
                "Amount": txn.get("Amount", 0),
                "Type": txn.get("Type", "expense"),
                "Customer_name": txn.get("Customer_name", default_customer),
                "City": txn.get("City", ""),
                "Category_of_expense": txn.get("Category_of_expense", ""),
            }
            fs.collection("transactions").add(record)
            stored_count += 1

    return jsonify({"message": f"Stored {stored_count} transactions"}), 200
|
| 301 |
+
|
| 302 |
+
# --------- Endpoint: Retrieve or Generate Financial Statement ---------
|
| 303 |
+
|
| 304 |
+
@app.route("/financial_statement", methods=["POST"])
def financial_statement():
    """
    Expects JSON:
      {
        "business_id": "...",
        "start_date": "YYYY-MM-DD",
        "end_date":   "YYYY-MM-DD",
        "statement_type": "Income Statement"|"Cashflow Statement"|"Balance Sheet"
      }
    If a cached report exists for that exact (business_id, start, end,
    statement_type), returns it. Otherwise generates via Gemini, returns it,
    and caches it in Firestore.

    Responses: 200 with {"report", "cached"}; 400 on invalid input; 404 when
    no transactions fall in the period; 502 when the model returns no text.
    """
    data = request.get_json(force=True) or {}
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    biz = data.get("business_id", "")
    sd = data.get("start_date", "")
    ed = data.get("end_date", "")
    stype = data.get("statement_type", "Income Statement")
    if not all(isinstance(v, str) for v in (biz, sd, ed, stype)):
        return jsonify({"error": "business_id, start_date, end_date and statement_type must be strings"}), 400
    biz = biz.strip()

    if not (biz and sd and ed):
        return jsonify({"error": "Missing one of business_id, start_date, end_date"}), 400

    # These values become part of a Firestore document id, where '/' would
    # be interpreted as a path separator.
    if "/" in biz or "/" in stype:
        return jsonify({"error": "business_id and statement_type must not contain '/'"}), 400

    # Parse ISO dates.
    try:
        dt_start = datetime.fromisoformat(sd)
        dt_end = datetime.fromisoformat(ed)
    except ValueError:
        return jsonify({"error": "Dates must be YYYY-MM-DD"}), 400

    # A date-only end date parses to midnight; extend the query bound to the
    # end of that day so transactions stamped later on the end date are kept.
    query_end = dt_end
    if dt_end.time() == datetime.min.time():
        query_end = dt_end.replace(hour=23, minute=59, second=59, microsecond=999999)

    # Check the cache.
    doc_id = f"{biz}__{sd}__{ed}__{stype.replace(' ','_')}"
    doc_ref = fs.collection("financial_statements").document(doc_id)
    cached = doc_ref.get()
    if cached.exists:
        cached_report = (cached.to_dict() or {}).get("report")
        if cached_report:
            return jsonify({"report": cached_report, "cached": True}), 200
        # A cache entry without a usable report is regenerated below.

    # Fetch transactions. The Python Firestore client compares timestamp
    # fields against plain datetime values; there is no firestore.Timestamp.
    snaps = (
        fs.collection("transactions")
          .where("business_id", "==", biz)
          .where("Date", ">=", dt_start)
          .where("Date", "<=", query_end)
          .stream()
    )
    txns = []
    for s in snaps:
        d = s.to_dict() or {}
        when = d.get("Date")
        txns.append({
            # Timestamp fields are read back as datetime objects.
            "Date": when.strftime("%d/%m/%Y") if hasattr(when, "strftime") else "",
            "Description": d.get("Description", ""),
            "Amount": d.get("Amount", 0),
            "Type": d.get("Type", ""),
            "Customer_name": d.get("Customer_name", ""),
            "City": d.get("City", ""),
            "Category_of_expense": d.get("Category_of_expense", ""),
        })

    if not txns:
        return jsonify({"error": "No transactions found for that period"}), 404

    # Generate with Gemini.
    prompt = (
        f"Based on the following transactions JSON data:\n"
        f"{json.dumps({'transactions': txns})}\n"
        f"Generate a detailed {stype} for the period from "
        f"{dt_start.strftime('%d/%m/%Y')} to {dt_end.strftime('%d/%m/%Y')} "
        f"in Markdown, following standard South African accounting practice, with headings, "
        "subtotals, totals, key highlights, and a concise summary."
    )
    chat = client.chats.create(model="gemini-2.5-pro-exp-03-25")
    resp = chat.send_message(prompt)
    # NOTE(review): presumably a rate-limit pause like the one in
    # process_with_gemini -- confirm it is still needed.
    time.sleep(7)
    report = resp.text
    if not report:
        # Do not cache an empty result; surface the upstream failure instead.
        return jsonify({"error": "Model returned an empty report"}), 502

    # Cache it.
    doc_ref.set({
        "business_id": biz,
        "start_date": sd,
        "end_date": ed,
        "statement_type": stype,
        "report": report,
        "created_at": firestore.SERVER_TIMESTAMP,
    })

    return jsonify({"report": report, "cached": False}), 200
|
| 389 |
+
|
| 390 |
+
# --------- Run the App ---------
|
| 391 |
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary Python, so it must not be switched on unconditionally
    # for an app bound to all interfaces. Opt in with FLASK_DEBUG=1 locally.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
|