rairo committed on
Commit
78a1433
·
verified ·
1 Parent(s): c0b5ea1

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +366 -0
main.py ADDED
@@ -0,0 +1,366 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import logging
4
+ import re
5
+ import tempfile
6
+ import time
7
+ from datetime import datetime
8
+ from flask import Flask, request, jsonify
9
+ from flask_cors import CORS
10
+ import pandas as pd
11
+ import pypdf
12
+ import google.generativeai as genai
13
+
14
# Route INFO-level (and higher) records through the root logger.
logging.basicConfig(level=logging.INFO)

# Flask application object; CORS is switched on for every route.
app = Flask(__name__)
CORS(app)

# The Gemini API key comes from the 'Gemini' environment variable.
# A missing or empty value aborts start-up immediately.
api_key = os.environ.get('Gemini')
if api_key is None or api_key == "":
    raise ValueError("Gemini API key not found in environment variables")
24
+
25
def configure_gemini(api_key, model_name='gemini-2.0-flash'):
    """Configure the Gemini SDK and return a generative model handle.

    Args:
        api_key: API key handed to ``genai.configure``.
        model_name: Name of the model to instantiate. Defaults to
            ``'gemini-2.0-flash'``, the value that used to be hard-coded,
            so existing single-argument callers are unaffected.

    Returns:
        The ``genai.GenerativeModel`` created for ``model_name``.

    Raises:
        Exception: Whatever the SDK raises; the error is logged and re-raised.
    """
    try:
        genai.configure(api_key=api_key)
        return genai.GenerativeModel(model_name)
    except Exception as e:
        logging.error(f"Error configuring Gemini: {str(e)}")
        raise
33
+
34
def read_pdf_pages(file_path):
    """Return the extracted text of every page of the PDF at ``file_path``.

    The result is a list with one string per page, in page order. A page
    whose ``extract_text()`` result is falsy contributes an empty string.
    Any exception is logged and then re-raised unchanged.
    """
    try:
        with open(file_path, 'rb') as pdf_file:
            return [
                (page.extract_text() or "")
                for page in pypdf.PdfReader(pdf_file).pages
            ]
    except Exception as err:
        logging.error(f"Error reading PDF: {str(err)}")
        raise
47
+
48
# Instruction prompt sent with each bank-statement page. It describes the
# fields to extract and the JSON envelope ({"transactions": [...]}) that the
# response parser expects.
# The Amount rule asks for the full numeric value: the earlier wording
# ("just the integer value") contradicted the "including decimals" rule below.
PROMPT = """Analyze this bank statement and extract transactions in JSON format with these fields:
- Date (format DD/MM/YYYY)
- Description
- Amount (numeric value only, without currency symbols or thousands separators; keep the decimals)
- Type (categorize into one of the following based on the transaction nature):
* 'income' - money received from customers, sales, services rendered
* 'expense' - operational costs, purchases, payments made
* 'asset' - purchase of equipment, property, vehicles, or other assets
* 'liability' - taking on debt, loans received, credit facilities
* 'equity' - owner investments, capital contributions, retained earnings transfers
* 'transfer' - money moved between own accounts, internal transfers
* 'investment' - securities purchases, investment account funding, portfolio additions
* 'loan_repayment' - paying back borrowed money, loan principal payments
* 'capital_injection' - owner or investor adding money to the business
- Customer Name (Only if Type is 'income' and if no name is extracted write 'general income'. For all other types, extract relevant party name or write 'N/A')
- City (In address of bank statement)
- Destination_of_funds (categorize based on Type and description):
* If 'expense': Water and electricity, Salaries and wages, Repairs & Maintenance, Motor vehicle expenses, Projects Expenses, Hardware expenses, Refunds, Accounting fees, Loan interest, Bank charges, Insurance, SARS PAYE UIF, Advertising & Marketing, Logistics and distribution, Fuel, Website hosting fees, Rentals, Subscriptions, Computer internet and Telephone, Staff training, Travel and accommodation, Depreciation, Other expenses
* If 'income': 'income'
* If 'asset': Equipment, Property, Vehicles, Technology, Furniture, Other assets
* If 'liability': Bank loan, Credit facility, Supplier credit, Other liabilities
* If 'equity': Owner investment, Retained earnings, Share capital, Other equity
* If 'transfer': Internal transfer
* If 'investment': Securities, Mutual funds, Fixed deposits, Other investments
* If 'loan_repayment': Loan repayment
* If 'capital_injection': Capital injection
- ignore opening or closing balances.
- extract the amount in full including decimals.
Return ONLY the raw JSON object, without any surrounding text, explanations, or markdown fences like ```json.

Return ONLY valid JSON with this structure:
{
"transactions": [
{
"Date": "string",
"Description": "string",
"Customer_name": "string",
"City": "string",
"Amount": number,
"Type": "string",
"Destination_of_funds": "string"
}
]
}"""
92
+
93
def get_text_prompt_with_date():
    """Build the free-text extraction prompt, embedding today's date.

    The current local date (``DD/MM/YYYY``) is inserted as the default date
    the model should use for transactions that mention no date.

    The Amount rule asks for the full numeric value including decimals, so
    that it agrees with the bank-statement prompt's "extract the amount in
    full including decimals" rule and with the ``"Amount": number`` schema;
    the previous "just the integer value" wording invited dropping cents.

    Returns:
        The complete prompt text as a string.
    """
    current_date = datetime.now().strftime("%d/%m/%Y")
    return f"""IMPORTANT: Today's date is {current_date}. If the user does not specify a date for a transaction, use {current_date} as the default date.

Analyze the following natural language text and extract transactions in JSON format with these fields:
- Date (format DD/MM/YYYY) - USE {current_date} IF NO DATE IS SPECIFIED
- Description
- Amount (numeric value only, without currency symbols or thousands separators; keep the decimals)
- Type (categorize into one of the following based on the transaction nature):
* 'income' - money received from customers, sales, services rendered
* 'expense' - operational costs, purchases, payments made
* 'asset' - purchase of equipment, property, vehicles, or other assets
* 'liability' - taking on debt, loans received, credit facilities
* 'equity' - owner investments, capital contributions, retained earnings transfers
* 'transfer' - money moved between own accounts, internal transfers
* 'investment' - securities purchases, investment account funding, portfolio additions
* 'loan_repayment' - paying back borrowed money, loan principal payments
* 'capital_injection' - owner or investor adding money to the business
- Customer Name (Only if Type is 'income' and if no name is extracted write 'general income'. For all other types, extract relevant party name or write 'N/A')
- City (extract from any address information provided or write 'N/A' if not available)
- Destination_of_funds (categorize based on Type and description):
* If 'expense': Water and electricity, Salaries and wages, Repairs & Maintenance, Motor vehicle expenses, Projects Expenses, Hardware expenses, Refunds, Accounting fees, Loan interest, Bank charges, Insurance, SARS PAYE UIF, Advertising & Marketing, Logistics and distribution, Fuel, Website hosting fees, Rentals, Subscriptions, Computer internet and Telephone, Staff training, Travel and accommodation, Depreciation, Other expenses
* If 'income': 'income'
* If 'asset': Equipment, Property, Vehicles, Technology, Furniture, Other assets
* If 'liability': Bank loan, Credit facility, Supplier credit, Other liabilities
* If 'equity': Owner investment, Retained earnings, Share capital, Other equity
* If 'transfer': Internal transfer
* If 'investment': Securities, Mutual funds, Fixed deposits, Other investments
* If 'loan_repayment': Loan repayment
* If 'capital_injection': Capital injection
- ignore opening or closing balances.

Return ONLY valid JSON with this structure:
{{
"transactions": [
{{
"Date": "string",
"Description": "string",
"Customer_name": "string",
"City": "string",
"Amount": number,
"Type": "string",
"Destination_of_funds": "string"
}}
]
}}
important: Return an empty array if no transactions are in the text. Do not make up false data.
"""
142
+
143
def repair_json_with_gemini(model, broken_json_string):
    """Ask Gemini, in a second call, to fix a syntactically invalid JSON string.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has
            a ``text`` attribute.
        broken_json_string: The malformed JSON text to be repaired.

    Returns:
        The raw text of the model's reply. It is not parsed or validated here.

    Raises:
        ValueError: If the repair call fails. The underlying exception is
            chained as ``__cause__`` so the root cause stays visible.
    """
    logging.info("Attempting to repair broken JSON with another Gemini call...")
    repair_prompt = f"""The following text is a JSON object that is syntactically incorrect.
It might have missing commas, brackets, or other errors.
Please fix the syntax to make it a valid JSON object.
Return ONLY the corrected, raw JSON object and nothing else.

Broken JSON:
{broken_json_string}
"""
    try:
        resp = model.generate_content(repair_prompt)
        return resp.text
    except Exception as e:
        logging.error(f"Error during JSON repair call: {e}")
        # Chain the original error instead of discarding it.
        raise ValueError("Failed to repair the JSON string.") from e
160
+
161
def call_gemini_with_retry_custom(model, text, prompt, retries=3, backoff_factor=2):
    """Call Gemini with a prompt and text, parse the JSON reply, retry on rate limits.

    The reply is parsed with ``extract_json_from_response``. If that fails,
    one repair round-trip (``repair_json_with_gemini``) is attempted and its
    output is parsed instead.

    Args:
        model: Object exposing ``generate_content``.
        text: The content to analyse.
        prompt: The instruction prompt sent alongside ``text``.
        retries: Maximum number of attempts (must be >= 1).
        backoff_factor: Base of the exponential wait; attempt ``k`` waits
            ``backoff_factor ** k`` seconds before the next try.

    Returns:
        The parsed JSON value returned by ``extract_json_from_response``.

    Raises:
        Exception: The last error is re-raised when it is not a rate-limit
            error or when the attempts are exhausted.
        RuntimeError: If ``retries`` is smaller than 1, so no call was made.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = model.generate_content([prompt, text])
            response_text = resp.text
            try:
                # First attempt to parse the original response
                return extract_json_from_response(response_text)
            except Exception:
                # If parsing fails, trigger the repair process
                logging.warning("Initial JSON parsing failed. Attempting repair.")
                repaired_text = repair_json_with_gemini(model, response_text)
                return extract_json_from_response(repaired_text)
        except Exception as e:
            msg = str(e)
            # Evaluate the rate-limit test on its own. The previous
            # `a or b and c` form let a '429' on the final attempt sleep and
            # then fall out of the loop, silently returning None.
            rate_limited = '429' in msg or 'RateLimit' in msg
            if rate_limited and attempt < retries:
                wait = backoff_factor ** attempt
                logging.warning(f"Rate limit hit, retrying in {wait}s (attempt {attempt}/{retries})")
                time.sleep(wait)
            else:
                logging.error(f"Error processing with Gemini after retries: {msg}")
                raise
    # Only reachable when the loop body never ran.
    raise RuntimeError("Gemini was not called: 'retries' must be at least 1")
184
+
185
def call_gemini_with_retry(model, text, retries=3, backoff_factor=2):
    """Run the retrying Gemini call using the default bank-statement ``PROMPT``.

    A thin convenience wrapper: all work is delegated to
    ``call_gemini_with_retry_custom`` and its result is returned as-is.
    """
    return call_gemini_with_retry_custom(
        model,
        text,
        PROMPT,
        retries=retries,
        backoff_factor=backoff_factor,
    )
189
+
190
def extract_json_from_response(response_text):
    """Extract and parse the JSON object embedded in an LLM reply.

    A ```` ```json ```` fenced block is preferred when present. Otherwise
    parsing starts at the first ``{`` in the text. Only the first complete
    JSON value from that point is decoded, so trailing prose is ignored even
    when it contains braces. The previous greedy ``{.*}`` match ran to the
    last ``}`` in the reply and failed on such text.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the text is empty/``None``, contains no ``{...}``
            span, or the candidate cannot be decoded. A decode failure is
            chained as ``__cause__``.
    """
    if not response_text:
        raise ValueError("No valid JSON object found in the LLM response")

    # First, try to find a JSON block enclosed in markdown fences
    fenced = re.search(r'```json\s*(\{.*?\})\s*```', response_text, re.DOTALL)
    if fenced:
        json_string = fenced.group(1)
    else:
        # No fence: start at the first '{'; require a '}' somewhere after it.
        start = response_text.find('{')
        if start == -1 or '}' not in response_text[start:]:
            raise ValueError("No valid JSON object found in the LLM response")
        json_string = response_text[start:]

    try:
        # raw_decode stops at the end of the first JSON value instead of
        # requiring the whole remaining string to be JSON.
        parsed, _end = json.JSONDecoder().raw_decode(json_string)
        return parsed
    except json.JSONDecodeError as e:
        logging.error(f"Failed to parse extracted JSON. Error: {e}")
        logging.error(f"Problematic JSON string was: {json_string}")
        raise ValueError(f"Could not parse JSON from LLM response: {e}") from e
210
+
211
@app.route('/process-pdf', methods=['POST'])
def process_pdf():
    """Handle a PDF upload, run each page through Gemini, and merge the results.

    Expects a multipart form field named ``file`` whose filename ends in
    ``.pdf``. The upload is written to a temporary file, its pages are
    extracted, every non-blank page is sent to Gemini, and the
    ``transactions`` lists of all pages are concatenated.

    Returns:
        ``{"transactions": [...]}`` on success; ``{"error": ...}`` with
        status 400 for a missing/invalid upload or a ``ValueError`` raised
        during processing, and status 500 for any other failure.
    """
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file uploaded'}), 400
        file = request.files['file']
        # filename may be None; treat that like an empty name.
        filename = file.filename or ''
        if not filename.lower().endswith('.pdf'):
            return jsonify({'error': 'A valid PDF file must be uploaded'}), 400

        # Reserve a temp path, then close the handle before writing to it by
        # name: reopening a still-open NamedTemporaryFile is not portable.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            file_path = tmp.name

        try:
            # Saving inside the try guarantees the temp file is removed even
            # when the save itself fails.
            file.save(file_path)

            model = configure_gemini(api_key)
            pages = read_pdf_pages(file_path)
            all_transactions = []

            for idx, page_text in enumerate(pages, start=1):
                if not page_text.strip():
                    continue
                logging.info(f"Processing page {idx}/{len(pages)}")
                result = call_gemini_with_retry(model, page_text)
                # The model may return a non-object or a null list; skip
                # those instead of crashing the whole request.
                txs = result.get('transactions') if isinstance(result, dict) else None
                if isinstance(txs, list):
                    all_transactions.extend(txs)

            return jsonify({'transactions': all_transactions})
        finally:
            try:
                os.remove(file_path)
            except OSError as cleanup_error:
                # A cleanup failure must not replace the real response/error.
                logging.warning(f"Could not remove temp file {file_path}: {cleanup_error}")

    except ValueError as ve:
        logging.warning(f"Client error: {ve}")
        return jsonify({'error': str(ve)}), 400
    except Exception as e:
        logging.error(f"Internal server error: {e}")
        return jsonify({'error': 'Internal server error'}), 500
249
+
250
@app.route('/process-text', methods=['POST'])
def process_text():
    """Extract transactions from free text supplied in a JSON request body.

    Expects a JSON object with a non-blank string field ``text``. The text
    is sent to Gemini together with a prompt that embeds today's date.

    Returns:
        ``{"transactions": [...]}`` on success; ``{"error": ...}`` with
        status 400 for a missing, malformed, or empty payload (or a
        ``ValueError`` raised during processing), and status 500 for any
        other failure.
    """
    try:
        # silent=True yields None for a missing/invalid JSON body instead of
        # raising an HTTP error that the blanket handler below would turn
        # into a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'text' not in data:
            return jsonify({'error': 'No text provided'}), 400

        text_input = data['text']

        if not isinstance(text_input, str):
            return jsonify({'error': 'Text input must be a string'}), 400
        if not text_input.strip():
            return jsonify({'error': 'Text input cannot be empty'}), 400

        # Configure Gemini model
        model = configure_gemini(api_key)

        # Generate prompt with current date
        text_prompt = get_text_prompt_with_date()

        # Process the text with Gemini
        logging.info("Processing text input for transaction extraction")
        result = call_gemini_with_retry_custom(model, text_input, text_prompt)

        # Tolerate a non-object reply or a null/absent list.
        transactions = result.get('transactions') if isinstance(result, dict) else None
        if not isinstance(transactions, list):
            transactions = []

        return jsonify({'transactions': transactions})

    except ValueError as ve:
        logging.warning(f"Client error: {ve}")
        return jsonify({'error': str(ve)}), 400
    except Exception as e:
        logging.error(f"Internal server error: {e}")
        return jsonify({'error': 'Internal server error'}), 500
284
+
285
@app.route('/transaction-types', methods=['GET'])
def get_transaction_types():
    """Return the supported transaction types with their destination categories.

    The response is ``{"types": [...]}``, where every entry carries the keys
    ``type``, ``description`` and ``destination_categories``.
    """
    # (type, description, destination categories) in response order.
    catalogue = (
        (
            "income",
            "Money received from customers, sales, services rendered",
            ["income"],
        ),
        (
            "expense",
            "Operational costs, purchases, payments made",
            [
                "Water and electricity", "Salaries and wages", "Repairs & Maintenance",
                "Motor vehicle expenses", "Projects Expenses", "Hardware expenses",
                "Refunds", "Accounting fees", "Loan interest", "Bank charges",
                "Insurance", "SARS PAYE UIF", "Advertising & Marketing",
                "Logistics and distribution", "Fuel", "Website hosting fees",
                "Rentals", "Subscriptions", "Computer internet and Telephone",
                "Staff training", "Travel and accommodation", "Depreciation",
                "Other expenses",
            ],
        ),
        (
            "asset",
            "Purchase of equipment, property, vehicles, or other assets",
            ["Equipment", "Property", "Vehicles", "Technology", "Furniture", "Other assets"],
        ),
        (
            "liability",
            "Taking on debt, loans received, credit facilities",
            ["Bank loan", "Credit facility", "Supplier credit", "Other liabilities"],
        ),
        (
            "equity",
            "Owner investments, capital contributions, retained earnings transfers",
            ["Owner investment", "Retained earnings", "Share capital", "Other equity"],
        ),
        (
            "transfer",
            "Money moved between own accounts, internal transfers",
            ["Internal transfer"],
        ),
        (
            "investment",
            "Securities purchases, investment account funding, portfolio additions",
            ["Securities", "Mutual funds", "Fixed deposits", "Other investments"],
        ),
        (
            "loan_repayment",
            "Paying back borrowed money, loan principal payments",
            ["Loan repayment"],
        ),
        (
            "capital_injection",
            "Owner or investor adding money to the business",
            ["Capital injection"],
        ),
    )
    payload = {
        "types": [
            {
                "type": kind,
                "description": summary,
                "destination_categories": categories,
            }
            for kind, summary, categories in catalogue
        ]
    }
    return jsonify(payload)
355
+
356
@app.route('/health', methods=['GET'])
def health_check():
    """Report service liveness with a status, the current timestamp, and version."""
    now_iso = datetime.now().isoformat()
    report = dict(status='healthy', timestamp=now_iso, version='2.0.0')
    return jsonify(report)
364
+
365
if __name__ == '__main__':
    # The server binds to all interfaces, so the interactive debugger (which
    # permits arbitrary code execution) must not be on by default.
    # Opt in explicitly for local work with FLASK_DEBUG=1.
    debug_enabled = os.getenv('FLASK_DEBUG', '').strip().lower() in ('1', 'true', 'yes', 'on')
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)