rairo committed on
Commit
ac4c899
·
verified ·
1 Parent(s): 4c92169

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +244 -10
main.py CHANGED
@@ -1,16 +1,20 @@
1
  import os
2
  import json
3
- import faiss
 
 
 
 
 
4
  import numpy as np
 
5
  import pickle
6
  from flask import Flask, request, jsonify
7
  from flask_cors import CORS
8
- import firebase_admin
9
- from firebase_admin import credentials, firestore
10
  from dotenv import load_dotenv
11
 
 
12
  from google import genai
13
- from google.genai import types
14
 
15
  load_dotenv()
16
 
@@ -18,19 +22,26 @@ load_dotenv()
18
  app = Flask(__name__)
19
  CORS(app)
20
 
 
21
  cred_json = os.environ.get("FIREBASE")
22
- if cred_json:
23
- cred = credentials.Certificate(json.loads(cred_json))
24
- firebase_admin.initialize_app(cred)
25
- fs = firestore.client()
 
 
 
 
26
 
27
  # --------- Google GenAI Client ---------
28
  client = genai.Client(api_key=os.getenv("Gemini"))
29
 
30
- # --------- FAISS Cache Paths ---------
31
  INDEX_PATH = "vector.index"
32
  DOCS_PATH = "documents.pkl"
33
 
 
 
34
  # --------- Fetch & Summarize Firestore Docs ---------
35
  def fetch_documents() -> list[str]:
36
  docs: list[str] = []
@@ -142,7 +153,82 @@ def retrieve_and_respond(user_query: str, top_k: int = 3) -> str:
142
  resp = chat.send_message(prompt)
143
  return resp.text
144
 
145
- # --------- Flask Endpoint ---------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  @app.route("/chat", methods=["POST"])
147
  def chat_endpoint():
148
  data = request.get_json(force=True)
@@ -154,5 +240,153 @@ def chat_endpoint():
154
  except Exception as e:
155
  return jsonify({"error": str(e)}), 500
156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
157
  if __name__ == "__main__":
158
  app.run(host="0.0.0.0", port=7860, debug=True)
 
1
  import os
2
  import json
3
+ import time
4
+ from datetime import datetime
5
+ from io import BytesIO
6
+
7
+ import pypdf
8
+ import firebase_admin
9
  import numpy as np
10
+ import faiss
11
  import pickle
12
  from flask import Flask, request, jsonify
13
  from flask_cors import CORS
 
 
14
  from dotenv import load_dotenv
15
 
16
+ from firebase_admin import credentials, firestore, storage
17
  from google import genai
 
18
 
19
  load_dotenv()
20
 
 
22
  app = Flask(__name__)
23
  CORS(app)
24
 
25
# --------- Firebase (Firestore + Storage) ---------
# Both settings are validated up front so a misconfigured deployment stops at
# startup with a message naming the variable, instead of failing later inside
# the Firebase libraries.
cred_json = os.environ.get("FIREBASE")
if not cred_json:
    raise RuntimeError("Missing FIREBASE env var")
try:
    cred_info = json.loads(cred_json)
except json.JSONDecodeError as exc:
    raise RuntimeError("FIREBASE env var is not valid JSON") from exc

storage_bucket = os.environ.get("FIREBASE_STORAGE_BUCKET")
if not storage_bucket:
    # storage.bucket() below is called without a name, so it relies on this
    # default bucket being configured.
    raise RuntimeError("Missing FIREBASE_STORAGE_BUCKET env var")

cred = credentials.Certificate(cred_info)
firebase_admin.initialize_app(cred, {"storageBucket": storage_bucket})
fs = firestore.client()
bucket = storage.bucket()

# --------- Google GenAI Client ---------
client = genai.Client(api_key=os.getenv("Gemini"))

# --------- FAISS Cache Paths ---------
INDEX_PATH = "vector.index"
DOCS_PATH = "documents.pkl"
42
 
43
+
44
+
45
  # --------- Fetch & Summarize Firestore Docs ---------
46
  def fetch_documents() -> list[str]:
47
  docs: list[str] = []
 
153
  resp = chat.send_message(prompt)
154
  return resp.text
155
 
156
+
157
+
158
+ # --------- Helpers for Bank-Statement Processing ---------
159
+
160
def read_pdf_pages(file_obj):
    """Build a PDF reader over ``file_obj`` and report how many pages it has.

    The stream is rewound to its start before parsing, so a handle that was
    already read elsewhere can be passed in as-is.

    Args:
        file_obj: Seekable binary stream holding the PDF.

    Returns:
        A ``(reader, page_count)`` tuple, where ``reader`` is the
        ``pypdf.PdfReader`` and ``page_count`` is ``len(reader.pages)``.
    """
    file_obj.seek(0)
    pdf = pypdf.PdfReader(file_obj)
    page_count = len(pdf.pages)
    return pdf, page_count
164
+
165
def extract_page_text(reader, page_num):
    """Return the text of page ``page_num``, or "" when it is unavailable.

    Args:
        reader: Object exposing a ``pages`` sequence whose items provide
            ``extract_text()``.
        page_num: Zero-based page index.

    Returns:
        The extracted text. An empty string is returned when the index is out
        of range (negative or past the last page) or when the page yields no
        text.
    """
    # Negative indices are rejected explicitly: sequence indexing would
    # otherwise wrap around and silently return a page counted from the end.
    if not 0 <= page_num < len(reader.pages):
        return ""
    return reader.pages[page_num].extract_text() or ""
169
+
170
# Instruction sent with every page of statement text. The field names in the
# bullet list match the keys of the JSON structure requested below.
_STATEMENT_PROMPT = """Analyze this bank statement and extract transactions in JSON format with these fields:
- Date (format DD/MM/YYYY)
- Description
- Amount (just the integer value)
- Type (is 'income' if 'credit amount', else 'expense')
- Customer_name (Only If Type is 'income' and if no name is extracted write 'general income' and if type is not 'income' write 'expense')
- City (In address of bank statement)
- Category_of_expense (a string, if transaction 'Type' is 'expense' categorize it based on description into: Water and electricity, Salaries and wages, Repairs & Maintenance, Motor vehicle expenses, Projects Expenses, Hardware expenses, Refunds, Accounting fees, Loan interest, Bank charges, Insurance, SARS PAYE UIF, Advertising & Marketing, Logistics and distribution, Fuel, Website hosting fees, Rentals, Subscriptions, Computer internet and Telephone, Staff training, Travel and accommodation, Depreciation, Other expenses. If no category matches, default to 'Other expenses'. If 'Type' is 'income' set Category_of_expense to 'income'.)
- ignore opening or closing balances, charts and analysis.

Return ONLY valid JSON with this structure:
{
  "transactions": [
    {
      "Date": "string",
      "Description": "string",
      "Customer_name": "string",
      "City": "string",
      "Amount": number,
      "Type": "string",
      "Category_of_expense": "string"
    }
  ]
}"""


def process_with_gemini(model, text, *, delay=6):
    """Ask ``model`` to extract transactions from one page of statement text.

    Args:
        model: Object exposing ``generate_content(contents)`` and returning a
            response with a ``text`` attribute.
        text: Plain text of a bank-statement page.
        delay: Seconds to pause after a successful call, and before the single
            retry. Acts as a crude rate limiter; values <= 0 disable the pause.

    Returns:
        The raw text of the model response (expected, but not guaranteed, to
        contain a JSON object with a ``transactions`` list).

    Raises:
        Exception: Whatever ``generate_content`` raised, unless the error
            carries ``response.status_code == 504``, in which case the call is
            retried once and only the retry's error propagates.
    """

    def _pause():
        if delay > 0:
            time.sleep(delay)

    def _ask():
        return model.generate_content([_STATEMENT_PROMPT, text]).text

    try:
        answer = _ask()
    except Exception as exc:
        status = getattr(getattr(exc, "response", None), "status_code", None)
        if status != 504:
            raise
        # Gateway timeout: wait, then retry exactly once.
        _pause()
        return _ask()

    _pause()
    return answer
206
+
207
def process_pdf_pages(model, pdf_file):
    """Extract transactions from every page of a PDF bank statement.

    Each page's text is sent to the model separately. Processing is
    best-effort: a page with no text, a failed model call, or an unparsable
    reply is skipped (and logged) rather than aborting the whole statement.

    Args:
        model: Model object forwarded to ``process_with_gemini``.
        pdf_file: Seekable binary stream containing the PDF.

    Returns:
        A list of transaction dicts collected across all pages, in page order.
    """
    import logging  # local import keeps this block self-contained

    log = logging.getLogger(__name__)

    reader, total_pages = read_pdf_pages(pdf_file)
    all_txns = []
    for pg in range(total_pages):
        txt = extract_page_text(reader, pg).strip()
        if not txt:
            continue

        try:
            raw = process_with_gemini(model, txt)
        except Exception:
            log.exception("Transaction extraction failed on page %d; skipping", pg + 1)
            continue
        if not raw:
            # The model may return no text at all.
            continue

        # Take the outermost {...} span; the model may wrap the JSON in prose
        # or Markdown code fences.
        start = raw.find("{")
        end = raw.rfind("}") + 1
        if start < 0 or end <= start:
            log.warning("No JSON object in model reply for page %d; skipping", pg + 1)
            continue
        js = raw[start:end].replace("```json", "").replace("```", "")

        try:
            data = json.loads(js)
        except json.JSONDecodeError:
            log.warning("Unparsable JSON in model reply for page %d; skipping", pg + 1)
            continue

        txns = data.get("transactions") if isinstance(data, dict) else None
        if isinstance(txns, list):
            # Keep only object entries; anything else cannot be a transaction.
            all_txns.extend(t for t in txns if isinstance(t, dict))
    return all_txns
230
+
231
+ # --------- Chat Endpoint ---------
232
  @app.route("/chat", methods=["POST"])
233
  def chat_endpoint():
234
  data = request.get_json(force=True)
 
240
  except Exception as e:
241
  return jsonify({"error": str(e)}), 500
242
 
243
+ # --------- Endpoint: Upload & Store Bank Statements ---------
244
+
245
class _GeminiTextModel:
    """Adapter exposing ``generate_content(contents)`` on top of ``client``.

    The ``google.genai`` package imported by this file is client-based
    (``genai.Client``) and has no ``GenerativeModel`` class, while the PDF
    helpers expect a model object with a ``generate_content`` method.
    """

    def __init__(self, model_name):
        self.model_name = model_name

    def generate_content(self, contents):
        return client.models.generate_content(model=self.model_name, contents=contents)


@app.route("/upload_statements", methods=["POST"])
def upload_statements():
    """Store uploaded bank-statement PDFs and the transactions found in them.

    Expects multipart/form-data:
      - 'business_id': string
      - 'files': one or more PDFs

    Each PDF is uploaded to Storage under
    ``<business_id>/bank_statements/<utc-iso-timestamp>_<filename>``, then its
    transactions are extracted and written to the Firestore collection
    'transactions', tagged with 'business_id'.

    Returns:
        200 with ``{"message": "Stored N transactions"}``, or 400 with an
        ``error`` message when 'business_id' or 'files' is missing.
    """
    business_id = request.form.get("business_id", "").strip()
    if not business_id:
        return jsonify({"error": "Missing business_id"}), 400

    if "files" not in request.files:
        return jsonify({"error": "No files part; upload under key 'files'"}), 400

    files = request.files.getlist("files")
    if not files:
        return jsonify({"error": "No files uploaded"}), 400

    model = _GeminiTextModel("gemini-2.0-flash-thinking-exp")
    transactions = fs.collection("transactions")
    stored_count = 0

    for f in files:
        # The filename is client-supplied: keep only its final component so it
        # cannot add extra path segments to the storage object name.
        filename = os.path.basename(f.filename or "") or "statement.pdf"

        # upload raw PDF to storage
        dest_path = f"{business_id}/bank_statements/{datetime.utcnow().isoformat()}_{filename}"
        blob = bucket.blob(dest_path)
        f.seek(0)
        blob.upload_from_file(f, content_type=f.content_type)
        # rewind for processing
        f.seek(0)

        # extract + store transactions
        for txn in process_pdf_pages(model, f):
            if not isinstance(txn, dict):
                continue
            try:
                dt = datetime.strptime(txn["Date"], "%d/%m/%Y")
            except (KeyError, TypeError, ValueError):
                # Missing or malformed date: fall back to the upload time.
                dt = datetime.utcnow()
            record = {
                "business_id": business_id,
                # The Python Firestore client takes datetime objects for
                # timestamp fields; it has no Timestamp wrapper class.
                "Date": dt,
                "Description": txn.get("Description", ""),
                "Amount": txn.get("Amount", 0),
                "Type": txn.get("Type", "expense"),
                "Customer_name": txn.get(
                    "Customer_name",
                    "general income" if txn.get("Type") == "income" else "expense",
                ),
                "City": txn.get("City", ""),
                "Category_of_expense": txn.get("Category_of_expense", ""),
            }
            transactions.add(record)
            stored_count += 1

    return jsonify({"message": f"Stored {stored_count} transactions"}), 200
301
+
302
+ # --------- Endpoint: Retrieve or Generate Financial Statement ---------
303
+
304
@app.route("/financial_statement", methods=["POST"])
def financial_statement():
    """Return a cached financial statement, or generate and cache a new one.

    Expects JSON:
      {
        "business_id": "...",
        "start_date": "YYYY-MM-DD",
        "end_date":   "YYYY-MM-DD",
        "statement_type": "Income Statement"|"Cashflow Statement"|"Balance Sheet"
      }
    ``statement_type`` defaults to "Income Statement".

    A report cached for the exact (business_id, start_date, end_date,
    statement_type) is returned as-is. Otherwise the matching transactions are
    read from Firestore, a Markdown report is generated via Gemini, stored in
    the 'financial_statements' collection, and returned.

    Returns:
        200 with ``{"report": ..., "cached": bool}``; 400 for missing or
        invalid input; 404 when no transactions fall within the period.
    """
    data = request.get_json(force=True) or {}
    if not isinstance(data, dict):
        return jsonify({"error": "Request body must be a JSON object"}), 400

    # Coerce to str so null or non-string JSON values cannot crash .strip().
    biz = str(data.get("business_id") or "").strip()
    sd = str(data.get("start_date") or "")
    ed = str(data.get("end_date") or "")
    stype = str(data.get("statement_type") or "Income Statement")

    if not (biz and sd and ed):
        return jsonify({"error": "Missing one of business_id, start_date, end_date"}), 400

    # Both values become part of a Firestore document ID, where '/' is a path
    # separator and would yield an invalid document path.
    if "/" in biz or "/" in stype:
        return jsonify({"error": "business_id and statement_type must not contain '/'"}), 400

    # parse iso dates
    try:
        dt_start = datetime.fromisoformat(sd)
        dt_end = datetime.fromisoformat(ed)
        reversed_range = dt_start > dt_end
    except (ValueError, TypeError):
        # TypeError: one value carries a UTC offset and the other does not.
        return jsonify({"error": "Dates must be YYYY-MM-DD"}), 400
    if reversed_range:
        return jsonify({"error": "start_date must not be after end_date"}), 400

    # check cache
    doc_id = f"{biz}__{sd}__{ed}__{stype.replace(' ','_')}"
    doc_ref = fs.collection("financial_statements").document(doc_id)
    cached = doc_ref.get()
    if cached.exists:
        return jsonify({"report": cached.to_dict()["report"], "cached": True}), 200

    # fetch transactions; datetime objects are passed directly as the range
    # bounds (the Python Firestore client has no Timestamp wrapper class)
    snaps = (
        fs.collection("transactions")
        .where("business_id", "==", biz)
        .where("Date", ">=", dt_start)
        .where("Date", "<=", dt_end)
        .stream()
    )
    txns = []
    for s in snaps:
        d = s.to_dict()
        when = d.get("Date")
        txns.append({
            # Timestamp fields are read back as datetime objects.
            "Date": when.strftime("%d/%m/%Y") if isinstance(when, datetime) else "",
            "Description": d.get("Description", ""),
            "Amount": d.get("Amount", 0),
            "Type": d.get("Type", ""),
            "Customer_name": d.get("Customer_name", ""),
            "City": d.get("City", ""),
            "Category_of_expense": d.get("Category_of_expense", ""),
        })

    if not txns:
        return jsonify({"error": "No transactions found for that period"}), 404

    # generate with Gemini
    prompt = (
        f"Based on the following transactions JSON data:\n"
        f"{json.dumps({'transactions': txns})}\n"
        f"Generate a detailed {stype} for the period from "
        f"{dt_start.strftime('%d/%m/%Y')} to {dt_end.strftime('%d/%m/%Y')} "
        f"in Markdown, following standard South African accounting practice, with headings, "
        "subtotals, totals, key highlights, and a concise summary."
    )
    chat = client.chats.create(model="gemini-2.5-pro-exp-03-25")
    resp = chat.send_message(prompt)
    time.sleep(7)  # throttle between model calls (rate-limit workaround)
    report = resp.text

    # cache it
    doc_ref.set({
        "business_id": biz,
        "start_date": sd,
        "end_date": ed,
        "statement_type": stype,
        "report": report,
        "created_at": firestore.SERVER_TIMESTAMP,
    })

    return jsonify({"report": report, "cached": False}), 200
389
+
390
+ # --------- Run the App ---------
391
  if __name__ == "__main__":
392
  app.run(host="0.0.0.0", port=7860, debug=True)