rairo committed on
Commit
b61acc0
·
verified ·
1 Parent(s): ce82bf0

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +637 -110
main.py CHANGED
@@ -32,14 +32,12 @@ CORS(app)
32
  # Get API key securely
33
  api_key = os.getenv('Gemini')
34
  if not api_key:
35
- # Fallback for local testing if env var not set, though env var is preferred
36
  logging.warning("Gemini API key not found in environment variables.")
37
 
38
  def configure_gemini(api_key):
39
  """Configure Gemini AI model."""
40
  try:
41
  genai.configure(api_key=api_key)
42
- # Using 2.0 Flash as it has superior vision and long-context capabilities
43
  return genai.GenerativeModel('gemini-2.0-flash')
44
  except Exception as e:
45
  logging.error(f"Error configuring Gemini: {str(e)}")
@@ -49,8 +47,6 @@ def configure_gemini(api_key):
49
  # PROMPTS
50
  # -------------------------------------------------------------------------
51
 
52
- # Enhanced Prompt for General Financial Documents (Statements, Invoices, Receipts)
53
- # Addresses Point 1 (Rounding/Dates) & Point 3 (Document Types)
54
  FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List).
55
  Extract all relevant transactions/items in JSON format.
56
 
@@ -90,6 +86,49 @@ RETURN STRUCTURE:
90
  Return ONLY raw JSON. No markdown formatting.
91
  """
92
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  def get_text_prompt_with_fallback_date():
94
  """
95
  Generate prompt for raw text snippets where context might be missing.
@@ -103,46 +142,30 @@ If the text below does not specify a year or date, reasonable assume {current_da
103
  """
104
 
105
  # -------------------------------------------------------------------------
106
- # CATEGORIZATION LOGIC - TYPE-BASED (FIX FOR THE BUG)
107
  # -------------------------------------------------------------------------
108
 
109
  def categorize_transaction(transaction):
110
  """
111
  Categorizes a transaction based strictly on its Type field.
112
- This prevents keyword-based misclassification.
113
-
114
- Args:
115
- transaction: dict with keys including 'Type', 'Description', 'Destination_of_funds'
116
-
117
- Returns:
118
- dict with added 'Account_Category' field
119
  """
120
  tx_type = transaction.get('Type', '').lower()
121
  description = transaction.get('Description', '').lower()
122
  destination = transaction.get('Destination_of_funds', '').lower()
123
-
124
- # Add the categorized account field
125
  account_category = "Uncategorized"
126
-
127
- # ========== INCOME TYPE ==========
128
  if tx_type == 'income':
129
- # All income should map to revenue accounts, NOT expenses
130
  if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']):
131
  account_category = "Sales Revenue"
132
  elif any(keyword in description for keyword in ['interest', 'dividend']):
133
  account_category = "Interest Income"
134
  elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']):
135
- # This fixes the "Income Trap" - transfers FROM others are income
136
  account_category = "Other Income"
137
  else:
138
  account_category = "Other Income"
139
-
140
- # ========== EXPENSE TYPE ==========
141
  elif tx_type == 'expense':
142
- # Map based on Destination_of_funds or description keywords
143
- # This is TYPE-FIRST, so "cash" in description won't make it an asset
144
-
145
- # Specific expense categories based on your system
146
  if 'salaries' in destination or 'wages' in destination or 'salary' in description:
147
  account_category = "Salaries and Wages"
148
  elif 'water' in destination or 'electricity' in destination:
@@ -177,21 +200,16 @@ def categorize_transaction(transaction):
177
  account_category = "Travel and Accommodation"
178
  elif 'depreciation' in destination:
179
  account_category = "Depreciation"
180
-
181
- # Special cases based on description (but still respecting expense type)
182
  elif 'atm' in description and 'cash' in description:
183
- # This fixes the "Cash Trap" - ATM withdrawals are drawings, not assets
184
  account_category = "Owner's Drawings"
185
  elif 'payment to' in description:
186
- # Payment to suppliers/vendors
187
  if any(word in description for word in ['fabric', 'printing', 'material']):
188
  account_category = "Cost of Sales"
189
  else:
190
  account_category = "Miscellaneous Expense"
191
  else:
192
  account_category = "Miscellaneous Expense"
193
-
194
- # ========== ASSET TYPE ==========
195
  elif tx_type == 'asset':
196
  if 'equipment' in destination or 'equipment' in description:
197
  account_category = "Equipment"
@@ -205,8 +223,7 @@ def categorize_transaction(transaction):
205
  account_category = "Furniture"
206
  else:
207
  account_category = "Other Assets"
208
-
209
- # ========== LIABILITY TYPE ==========
210
  elif tx_type == 'liability':
211
  if 'bank loan' in destination or 'loan' in description:
212
  account_category = "Bank Loan"
@@ -214,8 +231,7 @@ def categorize_transaction(transaction):
214
  account_category = "Credit Facility"
215
  else:
216
  account_category = "Other Liabilities"
217
-
218
- # ========== EQUITY TYPE ==========
219
  elif tx_type == 'equity':
220
  if 'owner' in destination or 'capital' in description:
221
  account_category = "Owner Investment"
@@ -223,12 +239,10 @@ def categorize_transaction(transaction):
223
  account_category = "Retained Earnings"
224
  else:
225
  account_category = "Other Equity"
226
-
227
- # ========== TRANSFER TYPE ==========
228
  elif tx_type == 'transfer':
229
  account_category = "Internal Transfer"
230
-
231
- # ========== INVESTMENT TYPE ==========
232
  elif tx_type == 'investment':
233
  if 'securities' in destination or 'stock' in description:
234
  account_category = "Securities"
@@ -236,16 +250,13 @@ def categorize_transaction(transaction):
236
  account_category = "Mutual Funds"
237
  else:
238
  account_category = "Other Investments"
239
-
240
- # ========== LOAN REPAYMENT TYPE ==========
241
  elif tx_type == 'loan_repayment':
242
  account_category = "Loan Repayment"
243
-
244
- # ========== CAPITAL INJECTION TYPE ==========
245
  elif tx_type == 'capital_injection':
246
  account_category = "Capital Injection"
247
-
248
- # Add the category to the transaction
249
  transaction['Account_Category'] = account_category
250
  return transaction
251
 
@@ -255,69 +266,60 @@ def categorize_transaction(transaction):
255
 
256
  def extract_json_from_response(response_text):
257
  """Extract valid JSON from Gemini's response, handling Markdown fences."""
258
- # Remove markdown code blocks
259
  cleaned_text = re.sub(r'```json\s*', '', response_text)
260
  cleaned_text = re.sub(r'```\s*', '', cleaned_text)
261
-
262
- # Find JSON object
263
  match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
264
  if match:
265
  json_string = match.group(1)
266
  else:
267
- # Fallback: assume the whole text is JSON
268
  json_string = cleaned_text
269
 
270
  try:
271
  return json.loads(json_string)
272
  except json.JSONDecodeError:
273
  logging.warning("JSON parsing failed, attempting repair.")
274
- raise ValueError(json_string) # Pass invalid string to caller for repair
275
 
276
- def repair_json_with_gemini(model, broken_json_string):
277
  """Uses Gemini to fix broken JSON syntax."""
278
  repair_prompt = f"""Fix this broken JSON string. Return ONLY valid JSON.
279
- Broken JSON: {broken_json_string}"""
280
  try:
281
  resp = model.generate_content(repair_prompt)
282
  return extract_json_from_response(resp.text)
283
  except Exception as e:
284
  logging.error(f"JSON repair failed: {e}")
285
- return {"transactions": []} # Fail safe
286
 
287
  def call_gemini_with_retry(model, content, prompt, retries=2):
288
  """
289
- Generic runner for Gemini.
290
- Args:
291
- content: Can be a String (text) or a PIL.Image object (vision).
292
  """
293
  for attempt in range(retries + 1):
294
  try:
295
- # Gemini Python SDK handles [Prompt, Image] or [Prompt, Text] automatically
296
  response = model.generate_content([prompt, content])
297
-
298
  try:
299
  result = extract_json_from_response(response.text)
300
-
301
- # POST-PROCESSING: Categorize each transaction based on Type
302
  if 'transactions' in result:
303
  result['transactions'] = [
304
  categorize_transaction(tx) for tx in result['transactions']
305
  ]
306
-
307
  return result
308
  except ValueError as ve:
309
- # Value error here contains the broken JSON string
310
  broken_json = str(ve)
311
- repaired = repair_json_with_gemini(model, broken_json)
312
-
313
- # Categorize repaired transactions too
314
  if 'transactions' in repaired:
315
  repaired['transactions'] = [
316
  categorize_transaction(tx) for tx in repaired['transactions']
317
  ]
318
-
319
  return repaired
320
-
321
  except Exception as e:
322
  if "429" in str(e) or "ResourceExhausted" in str(e):
323
  time.sleep(2 * (attempt + 1))
@@ -328,26 +330,391 @@ def call_gemini_with_retry(model, content, prompt, retries=2):
328
 
329
  return {"transactions": []}
330
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
331
  def is_file_empty(file_path):
332
  """Check if file is empty."""
333
  return os.path.getsize(file_path) == 0
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  # -------------------------------------------------------------------------
336
  # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
337
  # -------------------------------------------------------------------------
338
 
339
  def process_pdf_page_as_image(model, pdf_path, page_num):
340
- """Point 4: Convert specific PDF page to image and process with Vision."""
341
  if not PDF_IMAGE_SUPPORT:
342
  raise ImportError("pdf2image/poppler not installed")
343
 
344
- # Convert specific page to image
345
- # first_page=page_num, last_page=page_num ensures we only convert 1 page at a time to save RAM
346
  images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
347
  if not images:
348
  return []
349
-
350
- # Process the image
351
  result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
352
  return result.get('transactions', [])
353
 
@@ -356,58 +723,48 @@ def process_pdf():
356
  """
357
  Smart PDF Processor:
358
  1. Checks if empty.
359
- 2. Tries standard Text extraction (Fast/Cheap).
360
- 3. If Text fails (Encryption) or is empty (Scanned), falls back to Vision (Slow/Powerful).
361
  """
362
  temp_path = None
363
  try:
364
- # 1. Validation
365
  if 'file' not in request.files:
366
  return jsonify({'error': 'No file uploaded'}), 400
367
  file = request.files['file']
368
  if file.filename == '':
369
  return jsonify({'error': 'No file selected'}), 400
370
 
371
- # Save Temp
372
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
373
  file.save(tmp.name)
374
  temp_path = tmp.name
375
 
376
- # Point 2: Empty File Check
377
  if is_file_empty(temp_path):
378
- return jsonify({'error': 'Uploaded file is empty'}), 400
379
 
380
  model = configure_gemini(api_key)
381
  all_transactions = []
382
-
383
- # Determine strategy: Try reading PDF structure first
384
  try:
385
  reader = pypdf.PdfReader(temp_path)
386
  num_pages = len(reader.pages)
387
-
388
  for i in range(num_pages):
389
  logging.info(f"Processing page {i+1}/{num_pages}")
390
-
391
- # Attempt Text Extraction
392
  try:
393
  text_content = reader.pages[i].extract_text()
394
  except Exception:
395
- text_content = "" # Force fallback if extraction fails
396
 
397
- # LOGIC: Check if text is sufficient. If < 50 chars, it's likely a scan or image-heavy.
398
  if text_content and len(text_content.strip()) > 50:
399
- # Strategy A: Text Mode
400
  logging.info("Text detected. Using Text Strategy.")
401
  result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
402
  else:
403
- # Strategy B: Vision Fallback (Point 4)
404
  logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
405
  if PDF_IMAGE_SUPPORT:
406
- # Page numbers in pypdf are 0-indexed, pdf2image uses 1-based indexing often,
407
- # but convert_from_path handles slicing via first_page/last_page (1-based)
408
- txs = process_pdf_page_as_image(model, temp_path, i+1)
409
  all_transactions.extend(txs)
410
- continue # Skip the rest of loop
411
  else:
412
  logging.warning("Cannot process scanned PDF - pdf2image missing.")
413
  result = {"transactions": []}
@@ -416,10 +773,8 @@ def process_pdf():
416
  all_transactions.extend(txs)
417
 
418
  except pypdf.errors.PdfReadError:
419
- # If pypdf fails completely (e.g., highly corrupted or weird encryption), try Vision on whole file
420
  logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
421
  if PDF_IMAGE_SUPPORT:
422
- # Warning: Processing all pages as images might be slow
423
  images = convert_from_path(temp_path)
424
  for img in images:
425
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
@@ -437,7 +792,7 @@ def process_pdf():
437
  os.remove(temp_path)
438
 
439
  # -------------------------------------------------------------------------
440
- # TEXT & IMAGE ENDPOINTS (UPDATED)
441
  # -------------------------------------------------------------------------
442
 
443
  @app.route('/process-text', methods=['POST'])
@@ -447,18 +802,17 @@ def process_text():
447
  data = request.get_json()
448
  if not data or 'text' not in data:
449
  return jsonify({'error': 'No text provided'}), 400
450
-
451
  text_input = data['text']
452
  if not text_input.strip():
453
- return jsonify({'error': 'Text input cannot be empty'}), 400 # Point 2
454
-
455
  model = configure_gemini(api_key)
456
- # Use specific prompt with date fallback for raw text
457
  prompt = get_text_prompt_with_fallback_date()
458
-
459
  result = call_gemini_with_retry(model, text_input, prompt)
460
  return jsonify({'transactions': result.get('transactions', [])})
461
-
462
  except Exception as e:
463
  logging.error(f"Error: {e}")
464
  return jsonify({'error': str(e)}), 500
@@ -471,8 +825,7 @@ def process_image():
471
  if 'file' not in request.files:
472
  return jsonify({'error': 'No file uploaded'}), 400
473
  file = request.files['file']
474
-
475
- # Point 2: Empty check
476
  file.seek(0, os.SEEK_END)
477
  size = file.tell()
478
  file.seek(0)
@@ -484,15 +837,11 @@ def process_image():
484
  temp_path = tmp.name
485
 
486
  model = configure_gemini(api_key)
487
-
488
- # Load image with PIL
489
  img = Image.open(temp_path)
490
-
491
- # Use the General Financial Prompt
492
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
493
-
494
  return jsonify({'transactions': result.get('transactions', [])})
495
-
496
  except Exception as e:
497
  logging.error(f"Error: {e}")
498
  return jsonify({'error': str(e)}), 500
@@ -500,10 +849,189 @@ def process_image():
500
  if temp_path and os.path.exists(temp_path):
501
  os.remove(temp_path)
502
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
503
  @app.route('/transaction-types', methods=['GET'])
504
  def get_transaction_types():
505
  """Return available transaction types and their categories."""
506
- # Kept identical for backwards compatibility
507
  transaction_types = {
508
  "types": [
509
  {
@@ -577,10 +1105,9 @@ def health_check():
577
  return jsonify({
578
  'status': 'healthy',
579
  'timestamp': datetime.now().isoformat(),
580
- 'version': '2.2.0',
581
  'vision_support': PDF_IMAGE_SUPPORT
582
  })
583
 
584
  if __name__ == '__main__':
585
- # Ensure this port matches your server configuration
586
  app.run(debug=True, host="0.0.0.0", port=7860)
 
32
  # Get API key securely
33
  api_key = os.getenv('Gemini')
34
  if not api_key:
 
35
  logging.warning("Gemini API key not found in environment variables.")
36
 
37
  def configure_gemini(api_key):
38
  """Configure Gemini AI model."""
39
  try:
40
  genai.configure(api_key=api_key)
 
41
  return genai.GenerativeModel('gemini-2.0-flash')
42
  except Exception as e:
43
  logging.error(f"Error configuring Gemini: {str(e)}")
 
47
  # PROMPTS
48
  # -------------------------------------------------------------------------
49
 
 
 
50
  FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List).
51
  Extract all relevant transactions/items in JSON format.
52
 
 
86
  Return ONLY raw JSON. No markdown formatting.
87
  """
88
 
89
+ STUDENT_IMPORT_PROMPT = """Analyze this student document and extract student records into JSON.
90
+
91
+ The document may be:
92
+ - a class list
93
+ - an admission register
94
+ - a handwritten register
95
+ - a scanned student form
96
+ - a camera-captured document
97
+ - a PDF page
98
+ - an uploaded image
99
+
100
+ RULES:
101
+ 1. Return ONLY raw JSON. No markdown.
102
+ 2. Extract as many student rows as possible.
103
+ 3. Support both printed and handwritten text.
104
+ 4. If a field is missing, return an empty string.
105
+ 5. Do not invent students.
106
+ 6. Ignore page numbers, signatures, totals, decorations, and repeated headers.
107
+ 7. Normalize similar fields as follows:
108
+ - class / stream / class_name -> class_name
109
+ - grade / form / level -> grade
110
+ - admission number / admission no / reg no / student no -> admission_number
111
+ - phone / mobile / contact -> phone_number
112
+
113
+ RETURN STRUCTURE:
114
+ {
115
+ "students": [
116
+ {
117
+ "name": "Student Name",
118
+ "admission_number": "ADM-001",
119
+ "class_name": "A",
120
+ "grade": "Grade 7",
121
+ "gender": "Female",
122
+ "email": "student@example.com",
123
+ "phone_number": "+2637...",
124
+ "extra_fields": [
125
+ { "name": "guardian_name", "value": "John Doe" }
126
+ ]
127
+ }
128
+ ]
129
+ }
130
+ """
131
+
132
  def get_text_prompt_with_fallback_date():
133
  """
134
  Generate prompt for raw text snippets where context might be missing.
 
142
  """
143
 
144
  # -------------------------------------------------------------------------
145
+ # CATEGORIZATION LOGIC - TYPE-BASED
146
  # -------------------------------------------------------------------------
147
 
148
  def categorize_transaction(transaction):
149
  """
150
  Categorizes a transaction based strictly on its Type field.
 
 
 
 
 
 
 
151
  """
152
  tx_type = transaction.get('Type', '').lower()
153
  description = transaction.get('Description', '').lower()
154
  destination = transaction.get('Destination_of_funds', '').lower()
155
+
 
156
  account_category = "Uncategorized"
157
+
 
158
  if tx_type == 'income':
 
159
  if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']):
160
  account_category = "Sales Revenue"
161
  elif any(keyword in description for keyword in ['interest', 'dividend']):
162
  account_category = "Interest Income"
163
  elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']):
 
164
  account_category = "Other Income"
165
  else:
166
  account_category = "Other Income"
167
+
 
168
  elif tx_type == 'expense':
 
 
 
 
169
  if 'salaries' in destination or 'wages' in destination or 'salary' in description:
170
  account_category = "Salaries and Wages"
171
  elif 'water' in destination or 'electricity' in destination:
 
200
  account_category = "Travel and Accommodation"
201
  elif 'depreciation' in destination:
202
  account_category = "Depreciation"
 
 
203
  elif 'atm' in description and 'cash' in description:
 
204
  account_category = "Owner's Drawings"
205
  elif 'payment to' in description:
 
206
  if any(word in description for word in ['fabric', 'printing', 'material']):
207
  account_category = "Cost of Sales"
208
  else:
209
  account_category = "Miscellaneous Expense"
210
  else:
211
  account_category = "Miscellaneous Expense"
212
+
 
213
  elif tx_type == 'asset':
214
  if 'equipment' in destination or 'equipment' in description:
215
  account_category = "Equipment"
 
223
  account_category = "Furniture"
224
  else:
225
  account_category = "Other Assets"
226
+
 
227
  elif tx_type == 'liability':
228
  if 'bank loan' in destination or 'loan' in description:
229
  account_category = "Bank Loan"
 
231
  account_category = "Credit Facility"
232
  else:
233
  account_category = "Other Liabilities"
234
+
 
235
  elif tx_type == 'equity':
236
  if 'owner' in destination or 'capital' in description:
237
  account_category = "Owner Investment"
 
239
  account_category = "Retained Earnings"
240
  else:
241
  account_category = "Other Equity"
242
+
 
243
  elif tx_type == 'transfer':
244
  account_category = "Internal Transfer"
245
+
 
246
  elif tx_type == 'investment':
247
  if 'securities' in destination or 'stock' in description:
248
  account_category = "Securities"
 
250
  account_category = "Mutual Funds"
251
  else:
252
  account_category = "Other Investments"
253
+
 
254
  elif tx_type == 'loan_repayment':
255
  account_category = "Loan Repayment"
256
+
 
257
  elif tx_type == 'capital_injection':
258
  account_category = "Capital Injection"
259
+
 
260
  transaction['Account_Category'] = account_category
261
  return transaction
262
 
 
266
 
267
def extract_json_from_response(response_text):
    """Extract and parse the JSON payload from a Gemini text response.

    Markdown code fences are stripped first. The widest ``{...}`` span in the
    remaining text is then parsed; when no braces are present the whole
    cleaned text is parsed instead.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the candidate text is not valid JSON. The exception
            message is the unparsed candidate string, so callers can hand
            ``str(exc)`` to a repair step; the original decode error is
            attached as ``__cause__``.
    """
    cleaned_text = re.sub(r'```json\s*', '', response_text)
    cleaned_text = re.sub(r'```\s*', '', cleaned_text)

    # Greedy match across newlines: first "{" through the last "}".
    match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
    json_string = match.group(1) if match else cleaned_text

    try:
        return json.loads(json_string)
    except json.JSONDecodeError as err:
        logging.warning("JSON parsing failed, attempting repair.")
        # Keep the broken text as the message (callers rely on str(exc)),
        # but chain the decode error so the real position is not lost.
        raise ValueError(json_string) from err
283
 
284
def repair_json_with_gemini(model, broken_json_string, fallback_key="transactions"):
    """Ask Gemini to repair a malformed JSON string and return the parsed object.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute.
        broken_json_string: The text that failed to parse as JSON.
        fallback_key: Key used for the empty fail-safe payload.

    Returns:
        The repaired JSON object as a dict, or ``{fallback_key: []}`` when the
        repair call fails, the repaired text still does not parse, or the
        repaired payload is not a JSON object.
    """
    repair_prompt = (
        "Fix this broken JSON string. Return ONLY valid JSON.\n"
        f"Broken JSON: {broken_json_string}"
    )
    try:
        resp = model.generate_content(repair_prompt)
        repaired = extract_json_from_response(resp.text)
    except Exception as e:
        # Best-effort boundary: any failure degrades to an empty payload.
        logging.error(f"JSON repair failed: {e}")
        return {fallback_key: []}

    # Callers index the result by key / call .get on it, so a repaired
    # top-level array or scalar is as unusable as a parse failure.
    if not isinstance(repaired, dict):
        logging.error("JSON repair failed: repaired payload is not a JSON object")
        return {fallback_key: []}
    return repaired
294
 
295
  def call_gemini_with_retry(model, content, prompt, retries=2):
296
  """
297
+ Generic runner for financial Gemini extraction.
 
 
298
  """
299
  for attempt in range(retries + 1):
300
  try:
 
301
  response = model.generate_content([prompt, content])
302
+
303
  try:
304
  result = extract_json_from_response(response.text)
305
+
 
306
  if 'transactions' in result:
307
  result['transactions'] = [
308
  categorize_transaction(tx) for tx in result['transactions']
309
  ]
310
+
311
  return result
312
  except ValueError as ve:
 
313
  broken_json = str(ve)
314
+ repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
315
+
 
316
  if 'transactions' in repaired:
317
  repaired['transactions'] = [
318
  categorize_transaction(tx) for tx in repaired['transactions']
319
  ]
320
+
321
  return repaired
322
+
323
  except Exception as e:
324
  if "429" in str(e) or "ResourceExhausted" in str(e):
325
  time.sleep(2 * (attempt + 1))
 
330
 
331
  return {"transactions": []}
332
 
333
def call_gemini_students_with_retry(model, content, prompt, retries=2):
    """Run a student-extraction request against Gemini with retries.

    Args:
        model: Object exposing ``generate_content``.
        content: Payload sent alongside the prompt.
        prompt: Instruction text sent as the first element of the request.
        retries: Number of additional attempts after the first one.

    Returns:
        The parsed payload when it is a dict whose ``"students"`` entry is a
        list; otherwise ``{"students": []}``. Also ``{"students": []}`` when
        every attempt was rate limited.

    Raises:
        Exception: Re-raises the last non-rate-limit error once the final
            attempt has failed.
    """

    def _students_or_empty(payload):
        # One shared shape check for both the direct and the repaired payload.
        if isinstance(payload, dict) and isinstance(payload.get('students'), list):
            return payload
        return {"students": []}

    for attempt in range(retries + 1):
        try:
            response = model.generate_content([prompt, content])

            try:
                return _students_or_empty(extract_json_from_response(response.text))
            except ValueError as ve:
                # The ValueError message carries the unparsable text.
                broken_json = str(ve)
                repaired = repair_json_with_gemini(model, broken_json, fallback_key="students")
                return _students_or_empty(repaired)

        except Exception as e:
            if "429" in str(e) or "ResourceExhausted" in str(e):
                # Linear backoff, but never sleep after the final attempt:
                # there is nothing left to retry.
                if attempt < retries:
                    time.sleep(2 * (attempt + 1))
                continue
            logging.error(f"Gemini Student Import Error: {e}")
            if attempt == retries:
                raise

    return {"students": []}
362
+
363
def is_file_empty(file_path):
    """Return True when the file at ``file_path`` is zero bytes long."""
    size_in_bytes = os.path.getsize(file_path)
    return not size_in_bytes
366
 
367
def parse_json_safely(value, default=None):
    """Coerce ``value`` into a dict or list, never raising.

    Args:
        value: A dict/list (returned unchanged), a JSON string, or anything
            else.
        default: Value returned when nothing usable can be produced. A fresh
            empty dict is used when omitted.

    Returns:
        ``value`` itself if it is already a dict or list; the decoded JSON if
        ``value`` is a string holding a JSON object or array; otherwise
        ``default``.
    """
    if default is None:
        default = {}
    if isinstance(value, (dict, list)):
        return value
    if not isinstance(value, str):
        # Covers None as well as any other non-container, non-string input.
        return default

    text = value.strip()
    if not text:
        return default

    try:
        parsed = json.loads(text)
    except (ValueError, RecursionError):
        # JSONDecodeError is a ValueError; RecursionError guards against
        # pathologically nested input.
        return default

    # A JSON scalar or null is no more usable than a non-container argument,
    # so apply the same dict/list gate to decoded values.
    return parsed if isinstance(parsed, (dict, list)) else default
383
+
384
def normalize_key(key: str) -> str:
    """Turn an arbitrary header/key into a lowercase snake_case token.

    The key is stringified, trimmed and lowercased; every run of characters
    outside ``a-z0-9`` becomes a single underscore, and leading/trailing
    underscores are removed.
    """
    lowered = str(key).strip().lower()
    underscored = re.sub(r"[^a-z0-9]+", "_", lowered)
    return underscored.strip("_")
386
+
387
def title_case_name(value: str) -> str:
    """Return ``value`` trimmed, with whitespace runs collapsed, in title case.

    Falsy input (``None``, ``""``, ``0``) and whitespace-only input yield an
    empty string.
    """
    trimmed = str(value or "").strip()
    if not trimmed:
        return ""
    single_spaced = re.sub(r"\s+", " ", trimmed)
    return single_spaced.title()
390
+
391
def normalize_gender(value: str) -> str:
    """Map common gender spellings to ``"Male"`` / ``"Female"``.

    Matching is case-insensitive and ignores surrounding whitespace. Empty or
    falsy input returns ``""``; any other value is returned trimmed and
    title-cased.
    """
    canonical = {
        "m": "Male",
        "male": "Male",
        "boy": "Male",
        "f": "Female",
        "female": "Female",
        "girl": "Female",
    }
    token = str(value or "").strip().lower()
    if token in canonical:
        return canonical[token]
    return str(value).strip().title() if token else ""
400
+
401
def ensure_extra_fields_list(value):
    """Sanitise a list of ``{"name": ..., "value": ...}`` extra-field entries.

    Args:
        value: Candidate list of extra-field dicts; anything that is not a
            list is treated as "no extra fields".

    Returns:
        A new list of ``{"name": str, "value": str}`` dicts. Non-dict items
        and items without a name are dropped; names and values are trimmed.
    """
    if not isinstance(value, list):
        return []

    def _text(raw):
        # None (e.g. JSON null) must become "", not the literal "None".
        return "" if raw is None else str(raw).strip()

    cleaned = []
    for item in value:
        if not isinstance(item, dict):
            continue
        name = _text(item.get("name"))
        if name:
            cleaned.append({"name": name, "value": _text(item.get("value"))})
    return cleaned
412
+
413
def build_student_prompt(template_fields=None):
    """Compose the student-import prompt plus the caller's pre-import settings.

    The settings (global defaults, admission-number options, free-form AI
    instructions, expected and custom fields) are read from
    ``template_fields`` and appended to ``STUDENT_IMPORT_PROMPT`` as a JSON
    blob under a ``PRE-IMPORT CONFIGURATION:`` heading.
    """
    settings = template_fields or {}

    global_defaults = {
        "grade": settings.get("grade"),
        # "class" is accepted as an alternative spelling of "class_name".
        "class_name": settings.get("class_name") or settings.get("class"),
        "gender": settings.get("gender"),
    }
    extra_context = {
        "global_defaults": global_defaults,
        "generate_admission_numbers": bool(settings.get("generate_admission_numbers")),
        "admission_prefix": settings.get("admission_prefix", "ADM"),
        "ai_instructions": settings.get("ai_instructions", ""),
        "expected_fields": settings.get("expected_fields", []),
        "custom_fields": settings.get("custom_fields", []),
    }
    config_json = json.dumps(extra_context, ensure_ascii=False)

    return f"{STUDENT_IMPORT_PROMPT}\n\nPRE-IMPORT CONFIGURATION:\n{config_json}\n"
434
+
435
def normalize_student_record(student, template_fields=None, sequence=None):
    """Normalise one parsed student record into the canonical import shape.

    Args:
        student: Raw record (dict) as produced by the model or a spreadsheet
            row. Keys are matched case/punctuation-insensitively against a
            table of known aliases. Anything that is not a dict is treated as
            an empty record.
        template_fields: Optional import settings. Read here: ``class_name`` /
            ``class``, ``grade``, ``gender`` (defaults for missing values),
            ``generate_admission_numbers``, ``admission_prefix``,
            ``admission_start``, ``admission_width`` and ``custom_fields``.
        sequence: 1-based position of the record in the import, used when an
            admission number has to be generated.

    Returns:
        A dict with the keys ``name``, ``admission_number``, ``class_name``,
        ``grade``, ``gender``, ``email``, ``phone_number`` (all strings) and
        ``extra_fields`` (list of ``{"name", "value"}`` dicts).

    Raises:
        ValueError: If ``admission_start`` / ``admission_width`` are set to
            something ``int()`` cannot convert.
    """
    template_fields = template_fields or {}
    raw = student if isinstance(student, dict) else {}

    alias_map = {
        "name": ["name", "student_name", "full_name", "learner_name", "pupil_name"],
        # normalize_key() strips trailing underscores, so "admission no." and
        # friends all arrive here as "admission_no".
        "admission_number": [
            "admission_number", "admission_no", "student_no",
            "student_number", "reg_no", "registration_number"
        ],
        "class_name": ["class_name", "class", "stream", "classroom"],
        "grade": ["grade", "form", "level"],
        "gender": ["gender", "sex"],
        "email": ["email", "email_address"],
        "phone_number": ["phone_number", "phone", "mobile", "contact", "contact_number"],
    }
    # Built once instead of once per raw key.
    known_aliases = {alias for aliases in alias_map.values() for alias in aliases}

    def _text(value):
        # None (e.g. JSON null) must become "", not the literal "None".
        return "" if value is None else str(value).strip()

    normalized_raw = {normalize_key(k): v for k, v in raw.items()}

    # First non-empty alias wins for each canonical field.
    mapped = {}
    for canonical, aliases in alias_map.items():
        for alias in aliases:
            text = _text(normalized_raw.get(alias))
            if text:
                mapped[canonical] = text
                break

    # Every other non-empty column is preserved as an extra field.
    extra_fields = []
    for key, value in normalized_raw.items():
        if key in known_aliases or key == "extra_fields":
            continue
        text = _text(value)
        if text:
            extra_fields.append({"name": key, "value": text})

    extra_fields.extend(ensure_extra_fields_list(raw.get("extra_fields")))

    mapped["name"] = title_case_name(mapped.get("name", ""))
    mapped["class_name"] = mapped.get("class_name") or _text(
        template_fields.get("class_name") or template_fields.get("class") or ""
    )
    mapped["grade"] = mapped.get("grade") or _text(template_fields.get("grade") or "")
    mapped["gender"] = normalize_gender(mapped.get("gender") or template_fields.get("gender") or "")
    mapped["email"] = _text(mapped.get("email", ""))
    mapped["phone_number"] = _text(mapped.get("phone_number", ""))

    if not mapped.get("admission_number") and template_fields.get("generate_admission_numbers"):
        prefix = _text(template_fields.get("admission_prefix") or "ADM") or "ADM"
        start = int(template_fields.get("admission_start", 1) or 1)
        width = int(template_fields.get("admission_width", 3) or 3)
        serial = start + ((sequence or 1) - 1)
        mapped["admission_number"] = f"{prefix}-{str(serial).zfill(width)}"
    else:
        mapped["admission_number"] = _text(mapped.get("admission_number", ""))

    # Template-level custom fields only fill in names the record lacks.
    existing_extra = {normalize_key(x["name"]) for x in extra_fields if x.get("name")}
    for item in template_fields.get("custom_fields", []) or []:
        if not isinstance(item, dict):
            continue
        name = _text(item.get("name"))
        value = _text(item.get("value"))
        if name and value and normalize_key(name) not in existing_extra:
            extra_fields.append({"name": name, "value": value})

    return {
        "name": mapped.get("name", ""),
        "admission_number": mapped.get("admission_number", ""),
        "class_name": mapped.get("class_name", ""),
        "grade": mapped.get("grade", ""),
        "gender": mapped.get("gender", ""),
        "email": mapped.get("email", ""),
        "phone_number": mapped.get("phone_number", ""),
        "extra_fields": extra_fields
    }
519
+
520
def validate_student_records(students):
    """Apply the import business rules to a list of student dicts.

    Rules:
    - ``name`` is required
    - ``admission_number`` must be unique (case-insensitive) if present

    Args:
        students: Iterable of student dicts. The input dicts are not mutated.

    Returns:
        A ``(validated, errors)`` tuple. ``validated`` holds a copy of every
        student with ``_row`` (1-based), ``_valid`` and ``_errors`` added.
        ``errors`` holds one ``{"row", "student", "errors"}`` entry per
        invalid row.
    """

    def _text(value):
        # A None field is "missing", not the literal string "None".
        return "" if value is None else str(value).strip()

    validated = []
    errors = []
    seen_admission_numbers = set()

    for row_number, student in enumerate(students, start=1):
        row_errors = []

        if not _text(student.get("name")):
            row_errors.append("name is required")

        admission_key = _text(student.get("admission_number")).lower()
        if admission_key:
            if admission_key in seen_admission_numbers:
                row_errors.append("admission_number must be unique")
            else:
                seen_admission_numbers.add(admission_key)

        item = dict(student)
        item["_row"] = row_number
        item["_valid"] = not row_errors
        item["_errors"] = row_errors

        if row_errors:
            errors.append({
                "row": row_number,
                "student": item,
                "errors": row_errors
            })

        validated.append(item)

    return validated, errors
561
+
562
def dedupe_students(students):
    """
    Basic dedupe within the current import.

    Prefers admission_number as the identity when available, otherwise falls
    back to name + class_name + grade. Comparison ignores case and
    surrounding whitespace. The first occurrence of each identity is kept and
    input order is preserved.

    Args:
        students: iterable of dict-like student records.

    Returns:
        A list containing the first record seen for each identity.
    """
    def _norm(value):
        # Treat null as empty; str(None) would become "none" and make every
        # student without an admission number look like the same student.
        return "" if value is None else str(value).strip().lower()

    unique = []
    seen = set()

    for student in students:
        admission_number = _norm(student.get("admission_number"))

        if admission_number:
            key = f"adm:{admission_number}"
        else:
            name = _norm(student.get("name"))
            class_name = _norm(student.get("class_name"))
            grade = _norm(student.get("grade"))
            key = f"name:{name}|class:{class_name}|grade:{grade}"

        if key in seen:
            continue
        seen.add(key)
        unique.append(student)

    return unique
588
+
589
def allowed_student_import_file(filename):
    """
    Return True when the filename's extension is accepted for student import.

    Accepted extensions (case-insensitive): images (.jpg, .jpeg, .png, .webp),
    .pdf, and spreadsheets (.xlsx, .xls, .csv). A missing or empty filename is
    rejected instead of raising.
    """
    if not filename:
        return False

    ext = os.path.splitext(str(filename).lower())[1]
    return ext in {".jpg", ".jpeg", ".png", ".webp", ".pdf", ".xlsx", ".xls", ".csv"}
592
+
593
def parse_students_from_dataframe(df, template_fields=None):
    """
    Convert a spreadsheet DataFrame into normalized student records.

    Fully empty rows and columns are dropped. Each remaining row becomes a
    ``{column name: text value}`` dict (missing cells are omitted) which is
    passed to ``normalize_student_record`` together with its 1-based sequence.

    Args:
        df: pandas DataFrame whose columns are the sheet headers.
        template_fields: optional template settings forwarded to
            ``normalize_student_record``; defaults to ``{}``.

    Returns:
        List of normalized student records, one per non-empty row.
    """
    def _cell_text(value):
        # Integer columns containing blanks (and ints upcast by iterrows) are
        # surfaced as floats; render 712345678.0 as "712345678" so phone and
        # admission numbers are not corrupted with a trailing ".0".
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    template_fields = template_fields or {}

    # dropna returns new frames, so the caller's DataFrame is left untouched.
    df = df.dropna(how="all").dropna(axis=1, how="all")

    raw_students = []
    for _, row in df.iterrows():
        raw = {
            str(col): _cell_text(row[col])
            for col in df.columns
            if not pd.isna(row[col])
        }
        if raw:
            raw_students.append(raw)

    return [
        normalize_student_record(student, template_fields=template_fields, sequence=position)
        for position, student in enumerate(raw_students, start=1)
    ]
616
+
617
def process_student_pdf_page_as_image(model, pdf_path, page_num, template_fields=None):
    """
    Render one PDF page to an image and extract students from it via Vision.

    Args:
        model: Gemini model handle passed to ``call_gemini_students_with_retry``.
        pdf_path: path of the PDF on disk.
        page_num: 1-based page number to render.
        template_fields: optional template settings; defaults to ``{}`` so the
            prompt builder and normalizer receive a dict, matching the other
            student parsers in this file.

    Returns:
        List of normalized student records for the page (empty when the page
        could not be rendered).

    Raises:
        ImportError: when pdf2image/poppler support is unavailable.
    """
    if not PDF_IMAGE_SUPPORT:
        raise ImportError("pdf2image/poppler not installed")

    template_fields = template_fields or {}

    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if not images:
        return []

    prompt = build_student_prompt(template_fields)
    result = call_gemini_students_with_retry(model, images[0], prompt)
    students = result.get('students', []) or []

    return [
        normalize_student_record(student, template_fields=template_fields, sequence=position)
        for position, student in enumerate(students, start=1)
    ]
633
+
634
def parse_students_from_pdf(model, pdf_path, template_fields=None):
    """
    Extract students from a PDF using a hybrid text + Vision strategy.

    For every page, embedded text longer than 50 characters is sent to the
    model as text; otherwise the page is rendered to an image (when pdf2image
    is available) and sent to the model as an image. If pypdf cannot read the
    file at all, every page is rendered and processed through Vision.

    All raw records are collected first and normalized exactly once with a
    document-wide sequence, so text pages and scanned pages are treated the
    same way.

    Args:
        model: Gemini model handle passed to ``call_gemini_students_with_retry``.
        pdf_path: path of the PDF on disk.
        template_fields: optional template settings; defaults to ``{}``.

    Returns:
        List of normalized student records for the whole document.

    Raises:
        ValueError: when the PDF is unreadable and the image fallback is
            unavailable.
    """
    template_fields = template_fields or {}
    prompt = build_student_prompt(template_fields)
    raw_students = []

    def _extract(payload):
        # payload is either page text or a rendered page image.
        result = call_gemini_students_with_retry(model, payload, prompt)
        return result.get('students', []) or []

    try:
        reader = pypdf.PdfReader(pdf_path)
        num_pages = len(reader.pages)

        for page_index in range(num_pages):
            logging.info(f"Processing student PDF page {page_index+1}/{num_pages}")

            try:
                text_content = reader.pages[page_index].extract_text() or ""
            except Exception:
                # Text extraction is best-effort; fall through to Vision.
                text_content = ""

            if len(text_content.strip()) > 50:
                raw_students.extend(_extract(text_content))
            elif PDF_IMAGE_SUPPORT:
                # Render only this page and keep the RAW model output; it is
                # normalized once below together with the text pages.
                images = convert_from_path(
                    pdf_path, first_page=page_index + 1, last_page=page_index + 1
                )
                if images:
                    raw_students.extend(_extract(images[0]))
            else:
                logging.warning("Skipped scanned PDF page because pdf2image is unavailable.")

    except pypdf.errors.PdfReadError:
        logging.warning("pypdf failed to read student PDF. Attempting full Vision fallback.")
        if not PDF_IMAGE_SUPPORT:
            raise ValueError("Unreadable PDF and pdf2image fallback unavailable.")

        # The fallback re-reads the whole document, so discard anything
        # gathered before the failure to avoid duplicated records.
        raw_students = []
        for img in convert_from_path(pdf_path):
            raw_students.extend(_extract(img))

    return [
        normalize_student_record(student, template_fields=template_fields, sequence=position)
        for position, student in enumerate(raw_students, start=1)
    ]
679
+
680
def parse_students_from_image_file(model, file_path, template_fields=None):
    """
    Extract students from a single image file via the Vision model.

    Args:
        model: Gemini model handle passed to ``call_gemini_students_with_retry``.
        file_path: path of the image on disk.
        template_fields: optional template settings; defaults to ``{}``.

    Returns:
        List of normalized student records found in the image.
    """
    template_fields = template_fields or {}
    prompt = build_student_prompt(template_fields)

    # Close the image's file handle as soon as the model call is done; the
    # caller deletes the temp file afterwards. The call stays inside the
    # ``with`` because Pillow loads pixel data lazily from the open file.
    with Image.open(file_path) as img:
        result = call_gemini_students_with_retry(model, img, prompt)

    students = result.get('students', []) or []

    return [
        normalize_student_record(student, template_fields=template_fields, sequence=position)
        for position, student in enumerate(students, start=1)
    ]
692
+
693
def read_spreadsheet_students(file_path, filename, template_fields=None):
    """
    Load a CSV/XLSX/XLS file and convert its rows to normalized students.

    Args:
        file_path: path of the uploaded file on disk.
        filename: original upload name; only its extension is used to pick
            the reader.
        template_fields: optional template settings forwarded to
            ``parse_students_from_dataframe``.

    Returns:
        List of normalized student records.

    Raises:
        ValueError: when the extension is not .csv, .xlsx or .xls.
    """
    ext = os.path.splitext(filename.lower())[1]

    if ext == ".csv":
        # Read every CSV column as text so identifiers such as "007" or
        # "0712345678" keep their leading zeros; empty cells still come back
        # as NaN and are skipped downstream.
        df = pd.read_csv(file_path, dtype=str)
    elif ext in {".xlsx", ".xls"}:
        df = pd.read_excel(file_path)
    else:
        raise ValueError("Unsupported spreadsheet format")

    return parse_students_from_dataframe(df, template_fields=template_fields)
704
+
705
  # -------------------------------------------------------------------------
706
  # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
707
  # -------------------------------------------------------------------------
708
 
709
def process_pdf_page_as_image(model, pdf_path, page_num):
    """Render a single PDF page to an image and extract its transactions with Vision.

    Raises ImportError when pdf2image/poppler support is unavailable and
    returns an empty list when the page could not be rendered.
    """
    if not PDF_IMAGE_SUPPORT:
        raise ImportError("pdf2image/poppler not installed")

    rendered_pages = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
    if rendered_pages:
        page_image = rendered_pages[0]
        response = call_gemini_with_retry(model, page_image, FINANCIAL_DOC_PROMPT)
        return response.get('transactions', [])

    return []
720
 
 
723
  """
724
  Smart PDF Processor:
725
  1. Checks if empty.
726
+ 2. Tries standard Text extraction.
727
+ 3. If Text fails or is empty, falls back to Vision.
728
  """
729
  temp_path = None
730
  try:
 
731
  if 'file' not in request.files:
732
  return jsonify({'error': 'No file uploaded'}), 400
733
  file = request.files['file']
734
  if file.filename == '':
735
  return jsonify({'error': 'No file selected'}), 400
736
 
 
737
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
738
  file.save(tmp.name)
739
  temp_path = tmp.name
740
 
 
741
  if is_file_empty(temp_path):
742
+ return jsonify({'error': 'Uploaded file is empty'}), 400
743
 
744
  model = configure_gemini(api_key)
745
  all_transactions = []
746
+
 
747
  try:
748
  reader = pypdf.PdfReader(temp_path)
749
  num_pages = len(reader.pages)
750
+
751
  for i in range(num_pages):
752
  logging.info(f"Processing page {i+1}/{num_pages}")
753
+
 
754
  try:
755
  text_content = reader.pages[i].extract_text()
756
  except Exception:
757
+ text_content = ""
758
 
 
759
  if text_content and len(text_content.strip()) > 50:
 
760
  logging.info("Text detected. Using Text Strategy.")
761
  result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
762
  else:
 
763
  logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
764
  if PDF_IMAGE_SUPPORT:
765
+ txs = process_pdf_page_as_image(model, temp_path, i + 1)
 
 
766
  all_transactions.extend(txs)
767
+ continue
768
  else:
769
  logging.warning("Cannot process scanned PDF - pdf2image missing.")
770
  result = {"transactions": []}
 
773
  all_transactions.extend(txs)
774
 
775
  except pypdf.errors.PdfReadError:
 
776
  logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
777
  if PDF_IMAGE_SUPPORT:
 
778
  images = convert_from_path(temp_path)
779
  for img in images:
780
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
 
792
  os.remove(temp_path)
793
 
794
  # -------------------------------------------------------------------------
795
+ # TEXT & IMAGE ENDPOINTS
796
  # -------------------------------------------------------------------------
797
 
798
  @app.route('/process-text', methods=['POST'])
 
802
  data = request.get_json()
803
  if not data or 'text' not in data:
804
  return jsonify({'error': 'No text provided'}), 400
805
+
806
  text_input = data['text']
807
  if not text_input.strip():
808
+ return jsonify({'error': 'Text input cannot be empty'}), 400
809
+
810
  model = configure_gemini(api_key)
 
811
  prompt = get_text_prompt_with_fallback_date()
812
+
813
  result = call_gemini_with_retry(model, text_input, prompt)
814
  return jsonify({'transactions': result.get('transactions', [])})
815
+
816
  except Exception as e:
817
  logging.error(f"Error: {e}")
818
  return jsonify({'error': str(e)}), 500
 
825
  if 'file' not in request.files:
826
  return jsonify({'error': 'No file uploaded'}), 400
827
  file = request.files['file']
828
+
 
829
  file.seek(0, os.SEEK_END)
830
  size = file.tell()
831
  file.seek(0)
 
837
  temp_path = tmp.name
838
 
839
  model = configure_gemini(api_key)
 
 
840
  img = Image.open(temp_path)
 
 
841
  result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
842
+
843
  return jsonify({'transactions': result.get('transactions', [])})
844
+
845
  except Exception as e:
846
  logging.error(f"Error: {e}")
847
  return jsonify({'error': str(e)}), 500
 
849
  if temp_path and os.path.exists(temp_path):
850
  os.remove(temp_path)
851
 
852
+ # -------------------------------------------------------------------------
853
+ # STUDENT IMPORT ENDPOINTS
854
+ # -------------------------------------------------------------------------
855
+
856
@app.route('/api/customers/parse-students-images', methods=['POST'])
def parse_students_images():
    """
    Parse student rosters from one or more uploaded files.

    Supports:
    - images (including camera-captured images)
    - PDFs
    - CSV
    - XLSX / XLS

    multipart/form-data:
    - files
    - template_fields (JSON string)

    Responds with the deduplicated, validated students plus a per-file
    summary. Unsupported and empty files are reported as "skipped"; any
    processing error aborts the request with HTTP 500.
    """
    temp_paths = []

    try:
        uploaded_files = request.files.getlist("files")
        if not uploaded_files:
            return jsonify({"error": "No files uploaded"}), 400

        template_fields = parse_json_safely(request.form.get("template_fields"), default={})
        if not isinstance(template_fields, dict):
            # Valid JSON that is not an object (e.g. a list) cannot be used
            # as template settings.
            template_fields = {}

        model = configure_gemini(api_key)

        all_students = []
        file_summaries = []

        for uploaded_file in uploaded_files:
            # The filename may be missing (None) as well as empty.
            if not uploaded_file or not uploaded_file.filename:
                continue

            filename = uploaded_file.filename

            if not allowed_student_import_file(filename):
                file_summaries.append({
                    "file": filename,
                    "students_extracted": 0,
                    "status": "skipped",
                    "reason": "unsupported file type"
                })
                continue

            ext = os.path.splitext(filename.lower())[1]

            with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp:
                temp_path = tmp.name
                # Register the path BEFORE saving so the file is cleaned up
                # even when the save itself fails.
                temp_paths.append(temp_path)
                uploaded_file.save(temp_path)

            if os.path.getsize(temp_path) == 0:
                file_summaries.append({
                    "file": filename,
                    "students_extracted": 0,
                    "status": "skipped",
                    "reason": "empty file"
                })
                continue

            parsed_students = []

            # NOTE(review): every parser numbers its records from 1 for each
            # file; if template_fields drives auto-generated admission
            # numbers, records from different files may share a number and
            # be merged by dedupe_students below - confirm against
            # normalize_student_record.
            if ext in {".jpg", ".jpeg", ".png", ".webp"}:
                parsed_students = parse_students_from_image_file(
                    model, temp_path, template_fields=template_fields
                )
            elif ext == ".pdf":
                parsed_students = parse_students_from_pdf(
                    model, temp_path, template_fields=template_fields
                )
            elif ext in {".csv", ".xlsx", ".xls"}:
                parsed_students = read_spreadsheet_students(
                    temp_path, filename, template_fields=template_fields
                )

            file_summaries.append({
                "file": filename,
                "students_extracted": len(parsed_students),
                "status": "processed"
            })

            all_students.extend(parsed_students)

        all_students = dedupe_students(all_students)
        validated_students, validation_errors = validate_student_records(all_students)

        valid_count = sum(1 for s in validated_students if s["_valid"])
        processed_count = sum(1 for x in file_summaries if x["status"] == "processed")

        return jsonify({
            "students": validated_students,
            "summary": {
                "files_received": len(uploaded_files),
                "files_processed": processed_count,
                "total_students_extracted": len(all_students),
                "valid_students": valid_count,
                "invalid_students": len(validated_students) - valid_count
            },
            "file_summaries": file_summaries,
            "validation_errors": validation_errors
        })

    except Exception as e:
        # logging.exception keeps the traceback, which the plain message lacks.
        logging.exception(f"Student import server error: {e}")
        return jsonify({"error": str(e)}), 500

    finally:
        for path in temp_paths:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Already gone - nothing to clean up.
                pass
            except OSError as cleanup_error:
                # Cleanup stays best-effort, but leave a trace instead of
                # failing silently.
                logging.warning(f"Could not remove temp file {path}: {cleanup_error}")
967
+
968
@app.route('/api/customers/validate-students-import', methods=['POST'])
def validate_students_import():
    """
    Validate already-parsed student rows from the preview table.

    Useful before save. Expects a JSON object with a ``students`` array; the
    rows are normalized, deduplicated and validated. Responds with the
    annotated rows, an overall ``valid`` flag and the validation errors.
    Returns HTTP 400 when ``students`` is not an array.
    """
    try:
        data = request.get_json(silent=True) or {}
        # A JSON body that is not an object (e.g. a bare array) has no
        # "students" key; answer with 400 rather than failing on .get().
        students = data.get("students", []) if isinstance(data, dict) else None

        if not isinstance(students, list):
            return jsonify({"error": "students must be an array"}), 400

        normalized = [
            normalize_student_record(student, template_fields={}, sequence=position)
            for position, student in enumerate(students, start=1)
        ]
        normalized = dedupe_students(normalized)
        validated_students, validation_errors = validate_student_records(normalized)

        return jsonify({
            "students": validated_students,
            "valid": not validation_errors,
            "validation_errors": validation_errors
        })

    except Exception as e:
        logging.error(f"Student validation error: {e}")
        return jsonify({"error": str(e)}), 500
997
+
998
@app.route('/api/customers/parse-students-manual', methods=['POST'])
def parse_students_manual():
    """
    Handle manual student entry from the UI.

    Sends rows through the same normalization + dedupe + validation pipeline
    as file imports. Expects a JSON object with a ``students`` array and an
    optional ``template_fields`` object. Returns HTTP 400 when ``students``
    is not an array.
    """
    try:
        data = request.get_json(silent=True) or {}
        # A JSON body that is not an object (e.g. a bare array) has no
        # "students" key; answer with 400 rather than failing on .get().
        if not isinstance(data, dict):
            return jsonify({"error": "students must be an array"}), 400

        students = data.get("students", [])
        template_fields = data.get("template_fields", {}) or {}
        if not isinstance(template_fields, dict):
            # Template settings are only meaningful as an object.
            template_fields = {}

        if not isinstance(students, list):
            return jsonify({"error": "students must be an array"}), 400

        normalized = [
            normalize_student_record(student, template_fields=template_fields, sequence=position)
            for position, student in enumerate(students, start=1)
        ]
        normalized = dedupe_students(normalized)
        validated_students, validation_errors = validate_student_records(normalized)

        return jsonify({
            "students": validated_students,
            "validation_errors": validation_errors
        })

    except Exception as e:
        logging.error(f"Manual student parse error: {e}")
        return jsonify({"error": str(e)}), 500
1027
+
1028
+ # -------------------------------------------------------------------------
1029
+ # OTHER ENDPOINTS
1030
+ # -------------------------------------------------------------------------
1031
+
1032
  @app.route('/transaction-types', methods=['GET'])
1033
  def get_transaction_types():
1034
  """Return available transaction types and their categories."""
 
1035
  transaction_types = {
1036
  "types": [
1037
  {
 
1105
  return jsonify({
1106
  'status': 'healthy',
1107
  'timestamp': datetime.now().isoformat(),
1108
+ 'version': '2.3.0',
1109
  'vision_support': PDF_IMAGE_SUPPORT
1110
  })
1111
 
1112
if __name__ == '__main__':
    # The server listens on all interfaces, so the interactive Werkzeug
    # debugger (which allows arbitrary code execution) must not be on by
    # default. Opt in explicitly with FLASK_DEBUG=1 for local development.
    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)