Spaces:

rairo
/

stmt-api

Running

App Files Files Community

rairo commited on Mar 31

Commit

b61acc0

verified ·

1 Parent(s): ce82bf0

Update main.py

Browse files

Files changed (1) hide show

main.py +637 -110

main.py CHANGED Viewed

@@ -32,14 +32,12 @@ CORS(app)
 # Get API key securely
 api_key = os.getenv('Gemini')
 if not api_key:
-    # Fallback for local testing if env var not set, though env var is preferred
     logging.warning("Gemini API key not found in environment variables.")
 def configure_gemini(api_key):
     """Configure Gemini AI model."""
     try:
         genai.configure(api_key=api_key)
-        # Using 2.0 Flash as it has superior vision and long-context capabilities
         return genai.GenerativeModel('gemini-2.0-flash')
     except Exception as e:
         logging.error(f"Error configuring Gemini: {str(e)}")
@@ -49,8 +47,6 @@ def configure_gemini(api_key):
 # PROMPTS
 # -------------------------------------------------------------------------
-# Enhanced Prompt for General Financial Documents (Statements, Invoices, Receipts)
-# Addresses Point 1 (Rounding/Dates) & Point 3 (Document Types)
 FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List).
 Extract all relevant transactions/items in JSON format.
@@ -90,6 +86,49 @@ RETURN STRUCTURE:
 Return ONLY raw JSON. No markdown formatting.
 """
 def get_text_prompt_with_fallback_date():
     """
     Generate prompt for raw text snippets where context might be missing.
@@ -103,46 +142,30 @@ If the text below does not specify a year or date, reasonable assume {current_da
 """
 # -------------------------------------------------------------------------
-# CATEGORIZATION LOGIC - TYPE-BASED (FIX FOR THE BUG)
 # -------------------------------------------------------------------------
 def categorize_transaction(transaction):
     """
     Categorizes a transaction based strictly on its Type field.
-    This prevents keyword-based misclassification.
-    Args:
-        transaction: dict with keys including 'Type', 'Description', 'Destination_of_funds'
-    Returns:
-        dict with added 'Account_Category' field
     """
     tx_type = transaction.get('Type', '').lower()
     description = transaction.get('Description', '').lower()
     destination = transaction.get('Destination_of_funds', '').lower()
-    # Add the categorized account field
     account_category = "Uncategorized"
-    # ========== INCOME TYPE ==========
     if tx_type == 'income':
-        # All income should map to revenue accounts, NOT expenses
         if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']):
             account_category = "Sales Revenue"
         elif any(keyword in description for keyword in ['interest', 'dividend']):
             account_category = "Interest Income"
         elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']):
-            # This fixes the "Income Trap" - transfers FROM others are income
             account_category = "Other Income"
         else:
             account_category = "Other Income"
-    # ========== EXPENSE TYPE ==========
     elif tx_type == 'expense':
-        # Map based on Destination_of_funds or description keywords
-        # This is TYPE-FIRST, so "cash" in description won't make it an asset
-        # Specific expense categories based on your system
         if 'salaries' in destination or 'wages' in destination or 'salary' in description:
             account_category = "Salaries and Wages"
         elif 'water' in destination or 'electricity' in destination:
@@ -177,21 +200,16 @@ def categorize_transaction(transaction):
             account_category = "Travel and Accommodation"
         elif 'depreciation' in destination:
             account_category = "Depreciation"
-        # Special cases based on description (but still respecting expense type)
         elif 'atm' in description and 'cash' in description:
-            # This fixes the "Cash Trap" - ATM withdrawals are drawings, not assets
             account_category = "Owner's Drawings"
         elif 'payment to' in description:
-            # Payment to suppliers/vendors
             if any(word in description for word in ['fabric', 'printing', 'material']):
                 account_category = "Cost of Sales"
             else:
                 account_category = "Miscellaneous Expense"
         else:
             account_category = "Miscellaneous Expense"
-    # ========== ASSET TYPE ==========
     elif tx_type == 'asset':
         if 'equipment' in destination or 'equipment' in description:
             account_category = "Equipment"
@@ -205,8 +223,7 @@ def categorize_transaction(transaction):
             account_category = "Furniture"
         else:
             account_category = "Other Assets"
-    # ========== LIABILITY TYPE ==========
     elif tx_type == 'liability':
         if 'bank loan' in destination or 'loan' in description:
             account_category = "Bank Loan"
@@ -214,8 +231,7 @@ def categorize_transaction(transaction):
             account_category = "Credit Facility"
         else:
             account_category = "Other Liabilities"
-    # ========== EQUITY TYPE ==========
     elif tx_type == 'equity':
         if 'owner' in destination or 'capital' in description:
             account_category = "Owner Investment"
@@ -223,12 +239,10 @@ def categorize_transaction(transaction):
             account_category = "Retained Earnings"
         else:
             account_category = "Other Equity"
-    # ========== TRANSFER TYPE ==========
     elif tx_type == 'transfer':
         account_category = "Internal Transfer"
-    # ========== INVESTMENT TYPE ==========
     elif tx_type == 'investment':
         if 'securities' in destination or 'stock' in description:
             account_category = "Securities"
@@ -236,16 +250,13 @@ def categorize_transaction(transaction):
             account_category = "Mutual Funds"
         else:
             account_category = "Other Investments"
-    # ========== LOAN REPAYMENT TYPE ==========
     elif tx_type == 'loan_repayment':
         account_category = "Loan Repayment"
-    # ========== CAPITAL INJECTION TYPE ==========
     elif tx_type == 'capital_injection':
         account_category = "Capital Injection"
-    # Add the category to the transaction
     transaction['Account_Category'] = account_category
     return transaction
@@ -255,69 +266,60 @@ def categorize_transaction(transaction):
 def extract_json_from_response(response_text):
     """Extract valid JSON from Gemini's response, handling Markdown fences."""
-    # Remove markdown code blocks
     cleaned_text = re.sub(r'```json\s*', '', response_text)
     cleaned_text = re.sub(r'```\s*', '', cleaned_text)
-    # Find JSON object
     match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
     if match:
         json_string = match.group(1)
     else:
-        # Fallback: assume the whole text is JSON
         json_string = cleaned_text
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
         logging.warning("JSON parsing failed, attempting repair.")
-        raise ValueError(json_string) # Pass invalid string to caller for repair
-def repair_json_with_gemini(model, broken_json_string):
     """Uses Gemini to fix broken JSON syntax."""
     repair_prompt = f"""Fix this broken JSON string. Return ONLY valid JSON.
-    Broken JSON: {broken_json_string}"""
     try:
         resp = model.generate_content(repair_prompt)
         return extract_json_from_response(resp.text)
     except Exception as e:
         logging.error(f"JSON repair failed: {e}")
-        return {"transactions": []} # Fail safe
 def call_gemini_with_retry(model, content, prompt, retries=2):
     """
-    Generic runner for Gemini.
-    Args:
-        content: Can be a String (text) or a PIL.Image object (vision).
     """
     for attempt in range(retries + 1):
         try:
-            # Gemini Python SDK handles [Prompt, Image] or [Prompt, Text] automatically
             response = model.generate_content([prompt, content])
             try:
                 result = extract_json_from_response(response.text)
-                # POST-PROCESSING: Categorize each transaction based on Type
                 if 'transactions' in result:
                     result['transactions'] = [
                         categorize_transaction(tx) for tx in result['transactions']
                     ]
                 return result
             except ValueError as ve:
-                # Value error here contains the broken JSON string
                 broken_json = str(ve)
-                repaired = repair_json_with_gemini(model, broken_json)
-                # Categorize repaired transactions too
                 if 'transactions' in repaired:
                     repaired['transactions'] = [
                         categorize_transaction(tx) for tx in repaired['transactions']
                     ]
                 return repaired
         except Exception as e:
             if "429" in str(e) or "ResourceExhausted" in str(e):
                 time.sleep(2 * (attempt + 1))
@@ -328,26 +330,391 @@ def call_gemini_with_retry(model, content, prompt, retries=2):
     return {"transactions": []}
 def is_file_empty(file_path):
     """Check if file is empty."""
     return os.path.getsize(file_path) == 0
 # -------------------------------------------------------------------------
 # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
 # -------------------------------------------------------------------------
 def process_pdf_page_as_image(model, pdf_path, page_num):
-    """Point 4: Convert specific PDF page to image and process with Vision."""
     if not PDF_IMAGE_SUPPORT:
         raise ImportError("pdf2image/poppler not installed")
-    # Convert specific page to image
-    # first_page=page_num, last_page=page_num ensures we only convert 1 page at a time to save RAM
     images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
     if not images:
         return []
-    # Process the image
     result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
     return result.get('transactions', [])
@@ -356,58 +723,48 @@ def process_pdf():
     """
     Smart PDF Processor:
     1. Checks if empty.
-    2. Tries standard Text extraction (Fast/Cheap).
-    3. If Text fails (Encryption) or is empty (Scanned), falls back to Vision (Slow/Powerful).
     """
     temp_path = None
     try:
-        # 1. Validation
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         if file.filename == '':
             return jsonify({'error': 'No file selected'}), 400
-        # Save Temp
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
             file.save(tmp.name)
             temp_path = tmp.name
-        # Point 2: Empty File Check
         if is_file_empty(temp_path):
-             return jsonify({'error': 'Uploaded file is empty'}), 400
         model = configure_gemini(api_key)
         all_transactions = []
-        # Determine strategy: Try reading PDF structure first
         try:
             reader = pypdf.PdfReader(temp_path)
             num_pages = len(reader.pages)
             for i in range(num_pages):
                 logging.info(f"Processing page {i+1}/{num_pages}")
-                # Attempt Text Extraction
                 try:
                     text_content = reader.pages[i].extract_text()
                 except Exception:
-                    text_content = "" # Force fallback if extraction fails
-                # LOGIC: Check if text is sufficient. If < 50 chars, it's likely a scan or image-heavy.
                 if text_content and len(text_content.strip()) > 50:
-                    # Strategy A: Text Mode
                     logging.info("Text detected. Using Text Strategy.")
                     result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
                 else:
-                    # Strategy B: Vision Fallback (Point 4)
                     logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
                     if PDF_IMAGE_SUPPORT:
-                        # Page numbers in pypdf are 0-indexed, pdf2image uses 1-based indexing often,
-                        # but convert_from_path handles slicing via first_page/last_page (1-based)
-                        txs = process_pdf_page_as_image(model, temp_path, i+1)
                         all_transactions.extend(txs)
-                        continue # Skip the rest of loop
                     else:
                         logging.warning("Cannot process scanned PDF - pdf2image missing.")
                         result = {"transactions": []}
@@ -416,10 +773,8 @@ def process_pdf():
                 all_transactions.extend(txs)
         except pypdf.errors.PdfReadError:
-            # If pypdf fails completely (e.g., highly corrupted or weird encryption), try Vision on whole file
             logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
             if PDF_IMAGE_SUPPORT:
-                # Warning: Processing all pages as images might be slow
                 images = convert_from_path(temp_path)
                 for img in images:
                     result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
@@ -437,7 +792,7 @@ def process_pdf():
             os.remove(temp_path)
 # -------------------------------------------------------------------------
-# TEXT & IMAGE ENDPOINTS (UPDATED)
 # -------------------------------------------------------------------------
 @app.route('/process-text', methods=['POST'])
@@ -447,18 +802,17 @@ def process_text():
         data = request.get_json()
         if not data or 'text' not in data:
             return jsonify({'error': 'No text provided'}), 400
         text_input = data['text']
         if not text_input.strip():
-            return jsonify({'error': 'Text input cannot be empty'}), 400 # Point 2
         model = configure_gemini(api_key)
-        # Use specific prompt with date fallback for raw text
         prompt = get_text_prompt_with_fallback_date()
         result = call_gemini_with_retry(model, text_input, prompt)
         return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
         return jsonify({'error': str(e)}), 500
@@ -471,8 +825,7 @@ def process_image():
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
-        # Point 2: Empty check
         file.seek(0, os.SEEK_END)
         size = file.tell()
         file.seek(0)
@@ -484,15 +837,11 @@ def process_image():
             temp_path = tmp.name
         model = configure_gemini(api_key)
-        # Load image with PIL
         img = Image.open(temp_path)
-        # Use the General Financial Prompt
         result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
         return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
         return jsonify({'error': str(e)}), 500
@@ -500,10 +849,189 @@ def process_image():
         if temp_path and os.path.exists(temp_path):
             os.remove(temp_path)
 @app.route('/transaction-types', methods=['GET'])
 def get_transaction_types():
     """Return available transaction types and their categories."""
-    # Kept identical for backwards compatibility
     transaction_types = {
         "types": [
             {
@@ -577,10 +1105,9 @@ def health_check():
     return jsonify({
         'status': 'healthy',
         'timestamp': datetime.now().isoformat(),
-        'version': '2.2.0',
         'vision_support': PDF_IMAGE_SUPPORT
     })
 if __name__ == '__main__':
-    # Ensure this port matches your server configuration
     app.run(debug=True, host="0.0.0.0", port=7860)

# Get API key securely
# The key is read from the environment variable named 'Gemini'. A missing key
# only produces a warning here; nothing in this block stops the module loading.
api_key = os.getenv('Gemini')
if not api_key:
    logging.warning("Gemini API key not found in environment variables.")
 def configure_gemini(api_key):
     """Configure Gemini AI model."""
     try:
         genai.configure(api_key=api_key)
         return genai.GenerativeModel('gemini-2.0-flash')
     except Exception as e:
         logging.error(f"Error configuring Gemini: {str(e)}")
 # PROMPTS
 # -------------------------------------------------------------------------
 FINANCIAL_DOC_PROMPT = """Analyze this financial document (which could be a Bank Statement, Invoice, Receipt, or Transaction List).
 Extract all relevant transactions/items in JSON format.
 Return ONLY raw JSON. No markdown formatting.
 """
# Base instruction prompt for student-record extraction. It tells the model to
# return a single JSON object with a "students" list and spells out the field
# names expected per student. build_student_prompt() appends a per-import
# configuration section to this text.
STUDENT_IMPORT_PROMPT = """Analyze this student document and extract student records into JSON.
The document may be:
- a class list
- an admission register
- a handwritten register
- a scanned student form
- a camera-captured document
- a PDF page
- an uploaded image
RULES:
1. Return ONLY raw JSON. No markdown.
2. Extract as many student rows as possible.
3. Support both printed and handwritten text.
4. If a field is missing, return an empty string.
5. Do not invent students.
6. Ignore page numbers, signatures, totals, decorations, and repeated headers.
7. Normalize similar fields as follows:
   - class / stream / class_name -> class_name
   - grade / form / level -> grade
   - admission number / admission no / reg no / student no -> admission_number
   - phone / mobile / contact -> phone_number
RETURN STRUCTURE:
{
  "students": [
    {
      "name": "Student Name",
      "admission_number": "ADM-001",
      "class_name": "A",
      "grade": "Grade 7",
      "gender": "Female",
      "email": "student@example.com",
      "phone_number": "+2637...",
      "extra_fields": [
        { "name": "guardian_name", "value": "John Doe" }
      ]
    }
  ]
}
"""
 def get_text_prompt_with_fallback_date():
     """
     Generate prompt for raw text snippets where context might be missing.
 """
 # -------------------------------------------------------------------------
+# CATEGORIZATION LOGIC - TYPE-BASED
 # -------------------------------------------------------------------------
 def categorize_transaction(transaction):
     """
     Categorizes a transaction based strictly on its Type field.
     """
     tx_type = transaction.get('Type', '').lower()
     description = transaction.get('Description', '').lower()
     destination = transaction.get('Destination_of_funds', '').lower()
     account_category = "Uncategorized"
     if tx_type == 'income':
         if any(keyword in description for keyword in ['sales', 'service', 'revenue', 'invoice']):
             account_category = "Sales Revenue"
         elif any(keyword in description for keyword in ['interest', 'dividend']):
             account_category = "Interest Income"
         elif any(keyword in description for keyword in ['transfer', 'deposit', 'payment']):
             account_category = "Other Income"
         else:
             account_category = "Other Income"
     elif tx_type == 'expense':
         if 'salaries' in destination or 'wages' in destination or 'salary' in description:
             account_category = "Salaries and Wages"
         elif 'water' in destination or 'electricity' in destination:
             account_category = "Travel and Accommodation"
         elif 'depreciation' in destination:
             account_category = "Depreciation"
         elif 'atm' in description and 'cash' in description:
             account_category = "Owner's Drawings"
         elif 'payment to' in description:
             if any(word in description for word in ['fabric', 'printing', 'material']):
                 account_category = "Cost of Sales"
             else:
                 account_category = "Miscellaneous Expense"
         else:
             account_category = "Miscellaneous Expense"
     elif tx_type == 'asset':
         if 'equipment' in destination or 'equipment' in description:
             account_category = "Equipment"
             account_category = "Furniture"
         else:
             account_category = "Other Assets"
     elif tx_type == 'liability':
         if 'bank loan' in destination or 'loan' in description:
             account_category = "Bank Loan"
             account_category = "Credit Facility"
         else:
             account_category = "Other Liabilities"
     elif tx_type == 'equity':
         if 'owner' in destination or 'capital' in description:
             account_category = "Owner Investment"
             account_category = "Retained Earnings"
         else:
             account_category = "Other Equity"
     elif tx_type == 'transfer':
         account_category = "Internal Transfer"
     elif tx_type == 'investment':
         if 'securities' in destination or 'stock' in description:
             account_category = "Securities"
             account_category = "Mutual Funds"
         else:
             account_category = "Other Investments"
     elif tx_type == 'loan_repayment':
         account_category = "Loan Repayment"
     elif tx_type == 'capital_injection':
         account_category = "Capital Injection"
     transaction['Account_Category'] = account_category
     return transaction
 def extract_json_from_response(response_text):
     """Extract valid JSON from Gemini's response, handling Markdown fences."""
     cleaned_text = re.sub(r'```json\s*', '', response_text)
     cleaned_text = re.sub(r'```\s*', '', cleaned_text)
     match = re.search(r'(\{.*\})', cleaned_text, re.DOTALL)
     if match:
         json_string = match.group(1)
     else:
         json_string = cleaned_text
     try:
         return json.loads(json_string)
     except json.JSONDecodeError:
         logging.warning("JSON parsing failed, attempting repair.")
+        raise ValueError(json_string)
def repair_json_with_gemini(model, broken_json_string, fallback_key="transactions"):
    """Ask Gemini to rewrite malformed JSON and return the parsed result.

    Args:
        model: Object exposing ``generate_content(prompt)`` whose result has a
            ``text`` attribute.
        broken_json_string: The text that failed to parse.
        fallback_key: Key of the empty list returned when the repair fails.

    Returns:
        The parsed repaired JSON, or ``{fallback_key: []}`` if anything in the
        repair attempt raises (the error is logged, not propagated).
    """
    instruction = (
        "Fix this broken JSON string. Return ONLY valid JSON.\n"
        f"Broken JSON: {broken_json_string}"
    )
    try:
        reply = model.generate_content(instruction)
        return extract_json_from_response(reply.text)
    except Exception as e:
        logging.error(f"JSON repair failed: {e}")
        return {fallback_key: []}
 def call_gemini_with_retry(model, content, prompt, retries=2):
     """
+    Generic runner for financial Gemini extraction.
     """
     for attempt in range(retries + 1):
         try:
             response = model.generate_content([prompt, content])
             try:
                 result = extract_json_from_response(response.text)
                 if 'transactions' in result:
                     result['transactions'] = [
                         categorize_transaction(tx) for tx in result['transactions']
                     ]
                 return result
             except ValueError as ve:
                 broken_json = str(ve)
+                repaired = repair_json_with_gemini(model, broken_json, fallback_key="transactions")
                 if 'transactions' in repaired:
                     repaired['transactions'] = [
                         categorize_transaction(tx) for tx in repaired['transactions']
                     ]
                 return repaired
         except Exception as e:
             if "429" in str(e) or "ResourceExhausted" in str(e):
                 time.sleep(2 * (attempt + 1))
     return {"transactions": []}
def call_gemini_students_with_retry(model, content, prompt, retries=2):
    """
    Generic runner for student Gemini extraction.

    Sends ``[prompt, content]`` to the model up to ``retries + 1`` times and
    returns the parsed response when it carries a ``students`` list. A response
    that parses but has no such list yields ``{"students": []}``. Unparseable
    responses go through ``repair_json_with_gemini`` once.

    Errors whose text contains "429" or "ResourceExhausted" trigger a sleep of
    ``2 * (attempt + 1)`` seconds before the next attempt. Any other error is
    logged and, on the final attempt, re-raised.
    """
    for attempt in range(retries + 1):
        try:
            response = model.generate_content([prompt, content])
            try:
                payload = extract_json_from_response(response.text)
            except ValueError as ve:
                # The ValueError message is the text that failed to parse.
                payload = repair_json_with_gemini(model, str(ve), fallback_key="students")
            if 'students' in payload and isinstance(payload.get('students'), list):
                return payload
            return {"students": []}
        except Exception as e:
            message = str(e)
            rate_limited = "429" in message or "ResourceExhausted" in message
            if not rate_limited:
                logging.error(f"Gemini Student Import Error: {e}")
                if attempt == retries:
                    raise
                continue
            time.sleep(2 * (attempt + 1))
    return {"students": []}
 def is_file_empty(file_path):
     """Check if file is empty."""
     return os.path.getsize(file_path) == 0
def parse_json_safely(value, default=None):
    """Decode ``value`` as JSON without ever raising.

    Dicts and lists are returned unchanged. Strings are stripped and decoded.
    Every other input (including ``None``, blank strings and undecodable
    text) yields ``default``, which itself falls back to an empty dict when
    not supplied.
    """
    fallback = {} if default is None else default

    if isinstance(value, (dict, list)):
        return value
    if not isinstance(value, str):
        return fallback

    text = value.strip()
    if not text:
        return fallback
    try:
        return json.loads(text)
    except Exception:
        return fallback
def normalize_key(key: str) -> str:
    """Turn an arbitrary field label into a lowercase snake_case key.

    The label is stringified, stripped and lowercased; every run of characters
    outside ``a-z0-9`` becomes one underscore, and leading/trailing
    underscores are removed.
    """
    lowered = str(key).strip().lower()
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
def title_case_name(value: str) -> str:
    """Collapse whitespace in a name and title-case it.

    Falsy input (``None``, ``""``, ``0``) produces an empty string.
    """
    text = str(value or "").strip()
    text = re.sub(r"\s+", " ", text)
    if not text:
        return ""
    return text.title()
def normalize_gender(value: str) -> str:
    """Map common gender spellings onto "Male" / "Female".

    Recognized tokens are matched case-insensitively after stripping. Empty or
    falsy input gives ``""``; any other value is returned stripped and
    title-cased.
    """
    known = {
        "m": "Male", "male": "Male", "boy": "Male",
        "f": "Female", "female": "Female", "girl": "Female",
    }
    token = str(value or "").strip().lower()
    canonical = known.get(token)
    if canonical is not None:
        return canonical
    if not token:
        return ""
    return str(value).strip().title()
def ensure_extra_fields_list(value):
    """Coerce ``value`` into a clean list of ``{"name", "value"}`` dicts.

    Non-list input yields an empty list. Within a list, entries that are not
    dicts or whose stripped ``name`` is empty are dropped; ``name`` and
    ``value`` are stringified and stripped.
    """
    if not isinstance(value, list):
        return []
    pairs = (
        (str(entry.get("name", "")).strip(), str(entry.get("value", "")).strip())
        for entry in value
        if isinstance(entry, dict)
    )
    return [{"name": label, "value": content} for label, content in pairs if label]
def build_student_prompt(template_fields=None):
    """Build the full student-import prompt for one request.

    The result is ``STUDENT_IMPORT_PROMPT`` followed by a
    "PRE-IMPORT CONFIGURATION" section containing a JSON dump of the settings
    read from ``template_fields`` (global defaults, admission-number options,
    free-form AI instructions, expected and custom fields).
    """
    config = template_fields or {}
    defaults = {
        "grade": config.get("grade"),
        "class_name": config.get("class_name") or config.get("class"),
        "gender": config.get("gender"),
    }
    extra_context = {
        "global_defaults": defaults,
        "generate_admission_numbers": bool(config.get("generate_admission_numbers")),
        "admission_prefix": config.get("admission_prefix", "ADM"),
        "ai_instructions": config.get("ai_instructions", ""),
        "expected_fields": config.get("expected_fields", []),
        "custom_fields": config.get("custom_fields", []),
    }
    serialized = json.dumps(extra_context, ensure_ascii=False)
    return f"""{STUDENT_IMPORT_PROMPT}
PRE-IMPORT CONFIGURATION:
{serialized}
"""
def normalize_student_record(student, template_fields=None, sequence=None):
    """
    Normalizes one parsed student record into the required shape.

    Args:
        student: Mapping of raw field names to values, as produced by the
            model or by a spreadsheet row. Anything that is not a dict is
            treated as an empty record instead of raising.
        template_fields: Optional import configuration. Keys read here are
            ``class_name``/``class``, ``grade``, ``gender``,
            ``generate_admission_numbers``, ``admission_prefix``,
            ``admission_start``, ``admission_width`` and ``custom_fields``.
        sequence: 1-based position of the record within the import, used when
            an admission number has to be generated (defaults to 1).

    Returns:
        dict with string fields ``name``, ``admission_number``,
        ``class_name``, ``grade``, ``gender``, ``email``, ``phone_number``
        and a list ``extra_fields`` of ``{"name", "value"}`` dicts.
    """
    template_fields = template_fields or {}
    # A non-mapping record (None, a bare string, ...) carries no usable fields.
    raw = student if isinstance(student, dict) else {}

    alias_map = {
        "name": ["name", "student_name", "full_name", "learner_name", "pupil_name"],
        "admission_number": [
            "admission_number", "admission_no", "admission_no_", "student_no",
            "student_number", "reg_no", "registration_number"
        ],
        "class_name": ["class_name", "class", "stream", "classroom"],
        "grade": ["grade", "form", "level"],
        "gender": ["gender", "sex"],
        "email": ["email", "email_address"],
        "phone_number": ["phone_number", "phone", "mobile", "contact", "contact_number"],
    }
    # Built once, not once per raw key.
    known_aliases = {alias for aliases in alias_map.values() for alias in aliases}

    normalized_raw = {normalize_key(k): v for k, v in raw.items()}

    # First non-empty alias wins for each canonical field.
    mapped = {}
    for canonical, aliases in alias_map.items():
        for alias in aliases:
            value = normalized_raw.get(alias)
            if value is None:
                # JSON null must count as "missing", not become the text "None".
                continue
            text = str(value).strip()
            if text:
                mapped[canonical] = text
                break

    # Every other non-empty column is preserved as an extra field.
    extra_fields = []
    for key, value in normalized_raw.items():
        if key in known_aliases or key == "extra_fields":
            continue
        if value is None or str(value).strip() == "":
            continue
        extra_fields.append({
            "name": key,
            "value": str(value).strip()
        })

    # Look the list up under its normalized key so "Extra Fields" /
    # "extra-fields" spellings are not silently dropped.
    extra_fields.extend(ensure_extra_fields_list(normalized_raw.get("extra_fields")))

    mapped["name"] = title_case_name(mapped.get("name", ""))
    mapped["class_name"] = mapped.get("class_name") or str(
        template_fields.get("class_name") or template_fields.get("class") or ""
    ).strip()
    mapped["grade"] = mapped.get("grade") or str(template_fields.get("grade") or "").strip()
    mapped["gender"] = normalize_gender(mapped.get("gender") or template_fields.get("gender") or "")
    mapped["email"] = str(mapped.get("email", "")).strip()
    mapped["phone_number"] = str(mapped.get("phone_number", "")).strip()

    if not mapped.get("admission_number") and template_fields.get("generate_admission_numbers"):
        prefix = str(template_fields.get("admission_prefix") or "ADM").strip() or "ADM"
        start = int(template_fields.get("admission_start", 1) or 1)
        width = int(template_fields.get("admission_width", 3) or 3)
        serial = start + ((sequence or 1) - 1)
        mapped["admission_number"] = f"{prefix}-{str(serial).zfill(width)}"
    else:
        mapped["admission_number"] = str(mapped.get("admission_number", "")).strip()

    # Template custom fields only fill in names the record does not already have.
    existing_extra = {normalize_key(x["name"]) for x in extra_fields if x.get("name")}
    for item in template_fields.get("custom_fields", []) or []:
        if not isinstance(item, dict):
            continue
        name = str(item.get("name", "")).strip()
        value = str(item.get("value", "")).strip()
        if name and normalize_key(name) not in existing_extra and value:
            extra_fields.append({"name": name, "value": value})

    return {
        "name": mapped.get("name", ""),
        "admission_number": mapped.get("admission_number", ""),
        "class_name": mapped.get("class_name", ""),
        "grade": mapped.get("grade", ""),
        "gender": mapped.get("gender", ""),
        "email": mapped.get("email", ""),
        "phone_number": mapped.get("phone_number", ""),
        "extra_fields": extra_fields
    }
def validate_student_records(students):
    """
    Apply the import business rules to each record.

    Rules:
    - name is required
    - admission_number must be unique if present (compared case-insensitively)

    Returns a ``(validated, errors)`` pair. ``validated`` holds a copy of
    every record annotated with ``_row`` (1-based), ``_valid`` and
    ``_errors``. ``errors`` holds one ``{"row", "student", "errors"}`` entry
    per failing record.
    """
    validated = []
    errors = []
    used_numbers = set()

    for row_number, record in enumerate(students, start=1):
        problems = []

        if not str(record.get("name", "")).strip():
            problems.append("name is required")

        number_key = str(record.get("admission_number", "")).strip().lower()
        if number_key:
            if number_key in used_numbers:
                problems.append("admission_number must be unique")
            else:
                used_numbers.add(number_key)

        annotated = {
            **record,
            "_row": row_number,
            "_valid": not problems,
            "_errors": problems,
        }
        if problems:
            errors.append({
                "row": row_number,
                "student": annotated,
                "errors": problems,
            })
        validated.append(annotated)

    return validated, errors
+def dedupe_students(students):
+    """
+    Basic dedupe within the current import; the first occurrence wins.
+
+    Prefers admission_number as the identity when available, otherwise
+    falls back to name+class+grade. All comparisons are trimmed and
+    case-insensitive.
+
+    Missing values (None) count as empty. Stringifying them would give every
+    student without an admission number the same "adm:none" key and collapse
+    them all into a single row.
+
+    Args:
+        students: iterable of dict-like student records.
+
+    Returns:
+        A new list with the original record objects, in input order, minus
+        later duplicates.
+    """
+    def _norm(value):
+        return "" if value is None else str(value).strip().lower()
+    unique = []
+    seen = set()
+    for student in students:
+        admission_number = _norm(student.get("admission_number"))
+        if admission_number:
+            key = f"adm:{admission_number}"
+        else:
+            name = _norm(student.get("name"))
+            class_name = _norm(student.get("class_name"))
+            grade = _norm(student.get("grade"))
+            key = f"name:{name}|class:{class_name}|grade:{grade}"
+        if key in seen:
+            continue
+        seen.add(key)
+        unique.append(student)
+    return unique
+def allowed_student_import_file(filename):
+    """Return True when the filename's extension (case-insensitive) is an accepted import type."""
+    supported = (".jpg", ".jpeg", ".png", ".webp", ".pdf", ".xlsx", ".xls", ".csv")
+    _, extension = os.path.splitext(filename)
+    return extension.lower() in supported
+def parse_students_from_dataframe(df, template_fields=None):
+    """
+    Turn a spreadsheet DataFrame into normalized student records.
+
+    Fully empty rows and columns are discarded. Every remaining non-null
+    cell is stringified and trimmed, keyed by its column label as a string.
+    Each row dict is then passed through normalize_student_record with a
+    1-based sequence number. The caller's DataFrame is not modified.
+    """
+    fields = template_fields or {}
+    frame = df.copy().dropna(how="all").dropna(axis=1, how="all")
+    columns = list(frame.columns)
+    records = []
+    for _, row in frame.iterrows():
+        cells = {}
+        for column in columns:
+            cell = row[column]
+            if not pd.isna(cell):
+                cells[str(column)] = str(cell).strip()
+        if cells:
+            records.append(cells)
+    return [
+        normalize_student_record(record, template_fields=fields, sequence=position)
+        for position, record in enumerate(records, start=1)
+    ]
+def process_student_pdf_page_as_image(model, pdf_path, page_num, template_fields=None):
+    """
+    Render one PDF page to an image and extract students from it with Vision.
+
+    Args:
+        model: Gemini model handle forwarded to call_gemini_students_with_retry.
+        pdf_path: path of the PDF on disk.
+        page_num: page to render; passed as both first_page and last_page to
+            convert_from_path (the PDF parser in this file calls it with i + 1).
+        template_fields: optional template configuration; None is treated as {}.
+
+    Returns:
+        A list of normalized student records for that page, or [] when the
+        page could not be rendered.
+
+    Raises:
+        ImportError: if pdf2image/poppler support is unavailable.
+    """
+    if not PDF_IMAGE_SUPPORT:
+        raise ImportError("pdf2image/poppler not installed")
+    # Match the sibling parsers: never pass None to the prompt builder or normalizer.
+    template_fields = template_fields or {}
+    images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
+    if not images:
+        return []
+    prompt = build_student_prompt(template_fields)
+    result = call_gemini_students_with_retry(model, images[0], prompt)
+    students = result.get('students', []) or []
+    return [
+        normalize_student_record(student, template_fields=template_fields, sequence=i + 1)
+        for i, student in enumerate(students)
+    ]
+def parse_students_from_pdf(model, pdf_path, template_fields=None):
+    """
+    Extract students from a PDF using a hybrid text + Vision strategy.
+
+    For each page, text extraction is tried first. Pages with more than 50
+    characters of text are sent to the model as text. Other pages are
+    rendered to an image and sent to Vision when pdf2image is available,
+    otherwise they are skipped with a warning. If pypdf cannot read the file
+    at all, every page is rendered and sent to Vision.
+
+    Raw model rows from every page are collected first and normalized exactly
+    once at the end, so sequence numbers run across the whole document.
+    Previously scanned pages were normalized per page (sequence restarting at
+    1) and then normalized a second time together with the text pages.
+
+    Args:
+        model: Gemini model handle forwarded to call_gemini_students_with_retry.
+        pdf_path: path of the PDF on disk.
+        template_fields: optional template configuration; None is treated as {}.
+
+    Returns:
+        A list of normalized student records.
+
+    Raises:
+        ValueError: if the PDF is unreadable and the Vision fallback is unavailable.
+    """
+    template_fields = template_fields or {}
+    prompt = build_student_prompt(template_fields)
+    def _extract(payload):
+        # payload is either page text or a rendered page image.
+        result = call_gemini_students_with_retry(model, payload, prompt)
+        return result.get('students', []) or []
+    raw_students = []
+    try:
+        reader = pypdf.PdfReader(pdf_path)
+        num_pages = len(reader.pages)
+        for i in range(num_pages):
+            logging.info(f"Processing student PDF page {i+1}/{num_pages}")
+            try:
+                text_content = reader.pages[i].extract_text() or ""
+            except Exception:
+                # A page that fails text extraction is handled like a scanned page.
+                text_content = ""
+            if len(text_content.strip()) > 50:
+                raw_students.extend(_extract(text_content))
+            elif PDF_IMAGE_SUPPORT:
+                images = convert_from_path(pdf_path, first_page=i + 1, last_page=i + 1)
+                if images:
+                    raw_students.extend(_extract(images[0]))
+            else:
+                logging.warning("Skipped scanned PDF page because pdf2image is unavailable.")
+    except pypdf.errors.PdfReadError:
+        logging.warning("pypdf failed to read student PDF. Attempting full Vision fallback.")
+        if not PDF_IMAGE_SUPPORT:
+            raise ValueError("Unreadable PDF and pdf2image fallback unavailable.")
+        for img in convert_from_path(pdf_path):
+            raw_students.extend(_extract(img))
+    return [
+        normalize_student_record(student, template_fields=template_fields, sequence=i + 1)
+        for i, student in enumerate(raw_students)
+    ]
+def parse_students_from_image_file(model, file_path, template_fields=None):
+    """
+    Extract students from a single image file with Vision.
+
+    The image is opened in a context manager so its file handle is released
+    as soon as the model call finishes. The upload endpoint deletes the
+    temporary file afterwards, and an open handle can block that deletion.
+
+    Args:
+        model: Gemini model handle forwarded to call_gemini_students_with_retry.
+        file_path: path of the image on disk.
+        template_fields: optional template configuration; None is treated as {}.
+
+    Returns:
+        A list of normalized student records (sequence numbers start at 1).
+    """
+    template_fields = template_fields or {}
+    prompt = build_student_prompt(template_fields)
+    with Image.open(file_path) as img:
+        result = call_gemini_students_with_retry(model, img, prompt)
+    students = result.get('students', []) or []
+    return [
+        normalize_student_record(student, template_fields=template_fields, sequence=i + 1)
+        for i, student in enumerate(students)
+    ]
+def read_spreadsheet_students(file_path, filename, template_fields=None):
+    """
+    Load a CSV/XLSX/XLS file and convert its rows to normalized students.
+
+    Cells are read as text (dtype=str) so identifiers keep their exact form.
+    With pandas' default type inference a phone number such as "0712345678"
+    loses its leading zero, and an integer column containing blanks is read
+    as floats and later stringified as "7.0".
+
+    Args:
+        file_path: path of the saved upload on disk.
+        filename: original upload name; only its extension is used to pick
+            the reader.
+        template_fields: optional template configuration forwarded to
+            parse_students_from_dataframe.
+
+    Returns:
+        A list of normalized student records.
+
+    Raises:
+        ValueError: if the extension is not .csv, .xlsx or .xls.
+    """
+    ext = os.path.splitext(filename.lower())[1]
+    if ext == ".csv":
+        df = pd.read_csv(file_path, dtype=str)
+    elif ext in {".xlsx", ".xls"}:
+        df = pd.read_excel(file_path, dtype=str)
+    else:
+        raise ValueError("Unsupported spreadsheet format")
+    return parse_students_from_dataframe(df, template_fields=template_fields)
 # -------------------------------------------------------------------------
 # CORE LOGIC: PDF PROCESSING (HYBRID TEXT + VISION)
 # -------------------------------------------------------------------------
 def process_pdf_page_as_image(model, pdf_path, page_num):
+    """
+    Convert a specific PDF page to an image and process it with Vision.
+
+    The page is rendered with convert_from_path (page_num is used as both
+    first_page and last_page) and the first rendered image is sent to the
+    model together with FINANCIAL_DOC_PROMPT.
+
+    Returns the 'transactions' list from the model result, or [] when the
+    page produced no image or the result has no such key.
+
+    Raises ImportError when pdf2image/poppler support is unavailable.
+    """
     if not PDF_IMAGE_SUPPORT:
         raise ImportError("pdf2image/poppler not installed")
     images = convert_from_path(pdf_path, first_page=page_num, last_page=page_num)
     if not images:
         return []
     result = call_gemini_with_retry(model, images[0], FINANCIAL_DOC_PROMPT)
     return result.get('transactions', [])
     """
     Smart PDF Processor:
     1. Checks if empty.
+    2. Tries standard Text extraction.
+    3. If Text fails or is empty, falls back to Vision.
     """
     temp_path = None
     try:
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         if file.filename == '':
             return jsonify({'error': 'No file selected'}), 400
         with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
             file.save(tmp.name)
             temp_path = tmp.name
         if is_file_empty(temp_path):
+            return jsonify({'error': 'Uploaded file is empty'}), 400
         model = configure_gemini(api_key)
         all_transactions = []
         try:
             reader = pypdf.PdfReader(temp_path)
             num_pages = len(reader.pages)
             for i in range(num_pages):
                 logging.info(f"Processing page {i+1}/{num_pages}")
                 try:
                     text_content = reader.pages[i].extract_text()
                 except Exception:
+                    text_content = ""
                 if text_content and len(text_content.strip()) > 50:
                     logging.info("Text detected. Using Text Strategy.")
                     result = call_gemini_with_retry(model, text_content, FINANCIAL_DOC_PROMPT)
                 else:
                     logging.info("Low text/Encryption detected. Switching to Vision Strategy.")
                     if PDF_IMAGE_SUPPORT:
+                        txs = process_pdf_page_as_image(model, temp_path, i + 1)
                         all_transactions.extend(txs)
+                        continue
                     else:
                         logging.warning("Cannot process scanned PDF - pdf2image missing.")
                         result = {"transactions": []}
                 all_transactions.extend(txs)
         except pypdf.errors.PdfReadError:
             logging.warning("pypdf failed to read file. Attempting full Vision fallback.")
             if PDF_IMAGE_SUPPORT:
                 images = convert_from_path(temp_path)
                 for img in images:
                     result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
             os.remove(temp_path)
 # -------------------------------------------------------------------------
+# TEXT & IMAGE ENDPOINTS
 # -------------------------------------------------------------------------
 @app.route('/process-text', methods=['POST'])
         data = request.get_json()
         if not data or 'text' not in data:
             return jsonify({'error': 'No text provided'}), 400
         text_input = data['text']
         if not text_input.strip():
+            return jsonify({'error': 'Text input cannot be empty'}), 400
         model = configure_gemini(api_key)
         prompt = get_text_prompt_with_fallback_date()
         result = call_gemini_with_retry(model, text_input, prompt)
         return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
         return jsonify({'error': str(e)}), 500
         if 'file' not in request.files:
             return jsonify({'error': 'No file uploaded'}), 400
         file = request.files['file']
         file.seek(0, os.SEEK_END)
         size = file.tell()
         file.seek(0)
             temp_path = tmp.name
         model = configure_gemini(api_key)
         img = Image.open(temp_path)
         result = call_gemini_with_retry(model, img, FINANCIAL_DOC_PROMPT)
         return jsonify({'transactions': result.get('transactions', [])})
     except Exception as e:
         logging.error(f"Error: {e}")
         return jsonify({'error': str(e)}), 500
         if temp_path and os.path.exists(temp_path):
             os.remove(temp_path)
+# -------------------------------------------------------------------------
+# STUDENT IMPORT ENDPOINTS
+# -------------------------------------------------------------------------
+@app.route('/api/customers/parse-students-images', methods=['POST'])
+def parse_students_images():
+    """
+    Extract student records from one or more uploaded files.
+
+    Supports:
+    - images (including camera-captured images)
+    - PDFs
+    - CSV
+    - XLSX / XLS
+
+    multipart/form-data:
+    - files
+    - template_fields (JSON string)
+
+    Responds with the validated students, an overall summary, one summary
+    entry per handled file and the validation errors. Returns 400 when no
+    files are uploaded and 500 on any unexpected failure. Temporary copies
+    of the uploads are removed in all cases; a failed removal is logged
+    instead of being silently ignored.
+    """
+    temp_paths = []
+    try:
+        uploaded_files = request.files.getlist("files")
+        if not uploaded_files:
+            return jsonify({"error": "No files uploaded"}), 400
+        template_fields = parse_json_safely(request.form.get("template_fields"), default={})
+        model = configure_gemini(api_key)
+        all_students = []
+        file_summaries = []
+        for uploaded_file in uploaded_files:
+            # Form parts without a filename carry no upload.
+            if not uploaded_file or uploaded_file.filename == "":
+                continue
+            if not allowed_student_import_file(uploaded_file.filename):
+                file_summaries.append({
+                    "file": uploaded_file.filename,
+                    "students_extracted": 0,
+                    "status": "skipped",
+                    "reason": "unsupported file type"
+                })
+                continue
+            with tempfile.NamedTemporaryFile(
+                delete=False,
+                suffix=os.path.splitext(uploaded_file.filename)[1]
+            ) as tmp:
+                uploaded_file.save(tmp.name)
+                # Register the path immediately so the finally block cleans it up.
+                temp_paths.append(tmp.name)
+                temp_path = tmp.name
+            if os.path.getsize(temp_path) == 0:
+                file_summaries.append({
+                    "file": uploaded_file.filename,
+                    "students_extracted": 0,
+                    "status": "skipped",
+                    "reason": "empty file"
+                })
+                continue
+            ext = os.path.splitext(uploaded_file.filename.lower())[1]
+            parsed_students = []
+            if ext in {".jpg", ".jpeg", ".png", ".webp"}:
+                parsed_students = parse_students_from_image_file(
+                    model, temp_path, template_fields=template_fields
+                )
+            elif ext == ".pdf":
+                parsed_students = parse_students_from_pdf(
+                    model, temp_path, template_fields=template_fields
+                )
+            elif ext in {".csv", ".xlsx", ".xls"}:
+                parsed_students = read_spreadsheet_students(
+                    temp_path, uploaded_file.filename, template_fields=template_fields
+                )
+            file_summaries.append({
+                "file": uploaded_file.filename,
+                "students_extracted": len(parsed_students),
+                "status": "processed"
+            })
+            all_students.extend(parsed_students)
+        # NOTE(review): dedupe_students drops rows that repeat an admission
+        # number before validation runs, so the uniqueness rule in
+        # validate_student_records cannot fire for them; confirm this is intended.
+        all_students = dedupe_students(all_students)
+        validated_students, validation_errors = validate_student_records(all_students)
+        valid_students = [s for s in validated_students if s["_valid"]]
+        invalid_students = [s for s in validated_students if not s["_valid"]]
+        return jsonify({
+            "students": validated_students,
+            "summary": {
+                "files_received": len(uploaded_files),
+                "files_processed": len([x for x in file_summaries if x["status"] == "processed"]),
+                # Counted after de-duplication.
+                "total_students_extracted": len(all_students),
+                "valid_students": len(valid_students),
+                "invalid_students": len(invalid_students)
+            },
+            "file_summaries": file_summaries,
+            "validation_errors": validation_errors
+        })
+    except Exception as e:
+        # logging.exception keeps the traceback for diagnosis.
+        logging.exception(f"Student import server error: {e}")
+        return jsonify({"error": str(e)}), 500
+    finally:
+        for path in temp_paths:
+            try:
+                if path and os.path.exists(path):
+                    os.remove(path)
+            except OSError as cleanup_error:
+                # Best-effort cleanup: never fail the request, but leave a trace.
+                logging.warning(f"Could not remove temp file {path}: {cleanup_error}")
+@app.route('/api/customers/validate-students-import', methods=['POST'])
+def validate_students_import():
+    """
+    Accepts already-parsed student rows from the preview table.
+    Useful before save.
+
+    Expects a JSON object with a "students" array. Rows are normalized,
+    de-duplicated and validated. Responds with the annotated rows, an
+    overall "valid" flag and the validation errors.
+
+    Returns 400 when the body is not a JSON object or "students" is not an
+    array (a JSON array body previously failed with a 500), and 500 on any
+    unexpected failure.
+    """
+    try:
+        data = request.get_json(silent=True) or {}
+        if not isinstance(data, dict):
+            return jsonify({"error": "request body must be a JSON object"}), 400
+        students = data.get("students", [])
+        if not isinstance(students, list):
+            return jsonify({"error": "students must be an array"}), 400
+        normalized = [
+            normalize_student_record(student, template_fields={}, sequence=i + 1)
+            for i, student in enumerate(students)
+        ]
+        normalized = dedupe_students(normalized)
+        validated_students, validation_errors = validate_student_records(normalized)
+        return jsonify({
+            "students": validated_students,
+            "valid": len(validation_errors) == 0,
+            "validation_errors": validation_errors
+        })
+    except Exception as e:
+        logging.error(f"Student validation error: {e}")
+        return jsonify({"error": str(e)}), 500
+@app.route('/api/customers/parse-students-manual', methods=['POST'])
+def parse_students_manual():
+    """
+    For manual entry from UI.
+    Sends rows through the same normalization + validation pipeline.
+
+    Expects a JSON object with a "students" array and an optional
+    "template_fields" object. Responds with the annotated rows and the
+    validation errors.
+
+    Returns 400 when the body is not a JSON object, "students" is not an
+    array or "template_fields" is not an object (these shapes previously
+    surfaced as a 500), and 500 on any unexpected failure.
+    """
+    try:
+        data = request.get_json(silent=True) or {}
+        if not isinstance(data, dict):
+            return jsonify({"error": "request body must be a JSON object"}), 400
+        students = data.get("students", [])
+        template_fields = data.get("template_fields", {}) or {}
+        if not isinstance(students, list):
+            return jsonify({"error": "students must be an array"}), 400
+        if not isinstance(template_fields, dict):
+            return jsonify({"error": "template_fields must be an object"}), 400
+        normalized = [
+            normalize_student_record(student, template_fields=template_fields, sequence=i + 1)
+            for i, student in enumerate(students)
+        ]
+        normalized = dedupe_students(normalized)
+        validated_students, validation_errors = validate_student_records(normalized)
+        return jsonify({
+            "students": validated_students,
+            "validation_errors": validation_errors
+        })
+    except Exception as e:
+        logging.error(f"Manual student parse error: {e}")
+        return jsonify({"error": str(e)}), 500
+# -------------------------------------------------------------------------
+# OTHER ENDPOINTS
+# -------------------------------------------------------------------------
 @app.route('/transaction-types', methods=['GET'])
 def get_transaction_types():
     """Return available transaction types and their categories."""
     transaction_types = {
         "types": [
             {
     return jsonify({
         'status': 'healthy',
         'timestamp': datetime.now().isoformat(),
+        'version': '2.3.0',
         'vision_support': PDF_IMAGE_SUPPORT
     })
 if __name__ == '__main__':
+    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
+    # who can reach the server execute code. The app binds to all interfaces,
+    # so debug is opt-in through FLASK_DEBUG instead of always on.
+    debug_enabled = os.getenv("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes"}
+    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)