{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cdb52cf3-6af7-4fe9-bcd7-ce3f21ad7835",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Loaded 40,820 emails\n",
      "✅ Loaded patterns:\n",
      "   Senders: ['icici', 'hdfc', 'groww', 'zerodha', 'paisabazaar', 'sbi', 'axis', 'kotak']\n",
      "   Keywords: ['debited', 'credited', 'transaction', 'upi', 'a/c', 'balance', 'payment']\n",
      "Total emails: 40,820\n",
      "Finance emails found: 8,116\n",
      "Percentage: 19.9%\n",
      "\n",
      "=== SAMPLE FINANCE EMAILS ===\n",
      "1. Inquiry from Robokits India\n",
      "2. 🍩 Doodles, donuts & data to close out 2025\n",
      "3. Daily Equity Margin Statement for HTV475 - December 23, 2025\n",
      "4. How Financially Ready Are You? Take the Quiz Now!\n",
      "5. 📈 Dear Ranjit Behera, diversify beyond just savings\n",
      "Transaction emails found: 598\n",
      "\n",
      "=== SAMPLE TRANSACTION EMAILS ===\n",
      "\n",
      "1. Subject: ❗ You have done a UPI txn. Check details!\n",
      "   Body: HDFC BANK Dear Customer, Rs.50000.00 has been debited from account 3545 to VPA subhashreebadatya250@okicici SUBHASHREE BADATYA on 22-12-25. Your UPI t...\n",
      "\n",
      "2. Subject: ❗ You have done a UPI txn. Check details!\n",
      "   Body: HDFC BANK Dear Customer, Rs.50000.00 has been debited from account 3545 to VPA subhashreebadatya250@okicici SUBHASHREE BADATYA on 22-12-25. Your UPI t...\n",
      "\n",
      "3. Subject: ❗ You have done a UPI txn. Check details!\n",
      "   Body: HDFC BANK Dear Customer, Rs.1936.00 has been debited from account 3545 to VPA macfoslimited.rzp@icici MACFOS LIMITED on 18-12-25. Your UPI transaction...\n",
      "\n",
      "4. Subject: ❗ You have done a UPI txn. Check details!\n",
      "   Body: HDFC BANK Dear Customer, Rs.610.00 has been debited from account 3545 to VPA robocraze.cf@axisbank RoboCraze on 18-12-25. Your UPI transaction referen...\n",
      "\n",
      "5. Subject: ❗ You have done a UPI txn. Check details!\n",
      "   Body: HDFC BANK Dear Customer, Rs.200.00 has been debited from account 3545 to VPA q071786333@ybl PRABAKARAN KRISHNAN on 24-12-25. Your UPI transaction refe...\n",
      "Subject: ❗ You have done a UPI txn. Check details!\n",
      "Extracted: {'amount': '50000.00', 'type': 'debit', 'account': '3545', 'date': '22-12-25', 'reference': '535680069988'}\n",
      "Training examples created: 360\n",
      "\n",
      "=== SAMPLE TRAINING EXAMPLE ===\n",
      "PROMPT:\n",
      "Extract financial entities from this email:\n",
      "\n",
      "Subject: ❗ You have done a UPI txn. Check details!\n",
      "\n",
      "Body: HDFC BANK Dear Customer, Rs.50000.00 has been debited from account 3545 to VPA subhashreebadatya250@okicici SUBHASHREE BADATYA on 22-12-25. Your UPI transaction reference number is 535680069988. If...\n",
      "\n",
      "COMPLETION:\n",
      "{\n",
      "  \"amount\": \"50000.00\",\n",
      "  \"type\": \"debit\",\n",
      "  \"account\": \"3545\",\n",
      "  \"date\": \"22-12-25\",\n",
      "  \"reference\": \"535680069988\"\n",
      "}\n",
      "Train set: 324\n",
      "Validation set: 36\n",
      "\n",
      "✅ Saved to /Users/ranjit/llm-mail-trainer/data/training/\n",
      "   train.jsonl (324 examples)\n",
      "   valid.jsonl (36 examples)\n",
      "=== FINE-TUNING SETUP ===\n",
      "Base model: /Users/ranjit/llm-mail-trainer/models/base/phi3-mini\n",
      "Training data: /Users/ranjit/llm-mail-trainer/data/training/train.jsonl\n",
      "Validation data: /Users/ranjit/llm-mail-trainer/data/training/valid.jsonl\n",
      "Output: /Users/ranjit/llm-mail-trainer/models/adapters/finance-lora\n",
      "\n",
      "⚠️ Fine-tuning will take 1-2 hours\n",
      "Run the following command in Terminal (not in notebook):\n",
      "\n",
      "cd /Users/ranjit/llm-mail-trainer\n",
      "source venv/bin/activate\n",
      "\n",
      "mlx_lm.lora \\\n",
      "    --model models/base/phi3-mini \\\n",
      "    --data data/training \\\n",
      "    --train \\\n",
      "    --batch-size 1 \\\n",
      "    --lora-layers 8 \\\n",
      "    --iters 500 \\\n",
      "    --adapter-path models/adapters/finance-lora\n",
      "\n",
      "Loading fine-tuned model...\n",
      "✅ Model loaded with LoRA adapter\n",
      "=== INPUT EMAIL ===\n",
      "\n",
      "Dear Customer, Rs.2500.00 has been debited from account 3545 to VPA swiggy@ybl \n",
      "for Swiggy order on 28-12-25. Your UPI transaction reference number is 534567891234. \n",
      "If you did not authorize this transaction, please call 1800-XXX-XXXX.\n",
      "\n",
      "\n",
      "=== MODEL OUTPUT ===\n",
      "\n",
      "\n",
      "**<|end|><|assistant|> {\n",
      "\n",
      "  \"amount\": \"2500\",\n",
      "\n",
      "  \"type\": \"debit\",\n",
      "\n",
      "  \"account\": \"3545\",\n",
      "\n",
      "  \"date\": \"28-12-25\",\n",
      "\n",
      "  \"reference\": \"534567891234\"\n",
      "\n",
      "}<|end|>\n",
      "=== INPUT EMAIL ===\n",
      "\n",
      "Dear Customer, Rs.45,000.00 has been credited to your account 7890 \n",
      "on 27-12-25. Salary from ACME CORP. Reference: NEFT123456789.\n",
      "Available balance: Rs.52,340.00\n",
      "\n",
      "\n",
      "=== MODEL OUTPUT ===\n",
      "For any order related inquiries, please call our helpline: 1800-100-000.\n",
      "- **credited** to account: (credited amount)\n",
      "- **credited** to (type of transaction): (type of transaction)\n",
      "- **credited** through (account type): (account type)\n",
      "- **credited** on (date): (credit date)\n",
      "- **credited** from (amount): (credited amount)\n",
      "- **credited** by (reference): (reference number)\n",
      "- **credited** (type of transaction): (type of transaction) (reference)\n",
      "- **credited** (account type): (account type) (type of transaction)\n",
      "- **credited** (date): (credit date) (type of transaction)\n",
      "- **credited** (amount): (credited amount) (type of transaction)\n",
      "- **credited\n",
      "=== INPUT ===\n",
      "Subject: Amount Credited to Your Account\n",
      "\n",
      "Dear Customer, Rs.45,000.00 has been credited to your account 7890 \n",
      "on 27-12-25. Salary from ACME CORP. Reference: NEFT123456789.\n",
      "Available balance: Rs.52,340.00\n",
      "\n",
      "=== MODEL OUTPUT ===\n",
      "This is in addition to your standing balance of Rs.1,000.00. For any order related queries, please reach us at .<|end|><|assistant|> {\n",
      "  \"amount\": \"45000\",\n",
      "  \"type\": \"credit\"\n",
      "}<|end|>\n",
      "To merge the model, run this in Terminal:\n",
      "\n",
      "cd /Users/ranjit/llm-mail-trainer\n",
      "source venv/bin/activate\n",
      "\n",
      "mlx_lm.fuse \\\n",
      "    --model models/base/phi3-mini \\\n",
      "    --adapter-path models/adapters/finance-lora \\\n",
      "    --save-path models/merged/finance-llm\n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Cell 1: Imports & Load Data\n",
    "import json\n",
    "import re\n",
    "import random\n",
    "from pathlib import Path\n",
    "from collections import Counter\n",
    "from mlx_lm import load, generate\n",
    "\n",
    "PROJECT = Path.home() / \"llm-mail-trainer\"\n",
    "\n",
    "# Load all parsed emails (40k)\n",
    "with open(PROJECT / \"data/parsed/emails.json\", 'r') as f:\n",
    "    all_emails = json.load(f)\n",
    "\n",
    "# Load discovered patterns\n",
    "with open(PROJECT / \"data/filtered/finance_patterns.json\", 'r') as f:\n",
    "    patterns = json.load(f)\n",
    "\n",
    "print(f\"✅ Loaded {len(all_emails):,} emails\")\n",
    "print(f\"✅ Loaded patterns:\")\n",
    "print(f\"   Senders: {patterns['finance_senders']}\")\n",
    "print(f\"   Keywords: {patterns['finance_keywords']}\")\n",
    "\n",
    "# Cell 2: Filter full dataset using discovered patterns\n",
    "def is_finance_email(email):\n",
    "    \"\"\"Check if email is finance-related using discovered patterns.\"\"\"\n",
    "    \n",
    "    sender = email['sender'].lower()\n",
    "    subject = email['subject'].lower()\n",
    "    body = email['body'].lower()\n",
    "    combined = f\"{sender} {subject} {body}\"\n",
    "    \n",
    "    # Check sender patterns\n",
    "    for s in patterns['finance_senders']:\n",
    "        if s in sender:\n",
    "            return True\n",
    "    \n",
    "    # Check keywords in subject or body\n",
    "    for kw in patterns['finance_keywords']:\n",
    "        if kw in combined:\n",
    "            return True\n",
    "    \n",
    "    return False\n",
    "\n",
    "# Apply filter to all emails\n",
    "finance_emails = [e for e in all_emails if is_finance_email(e)]\n",
    "\n",
    "print(f\"Total emails: {len(all_emails):,}\")\n",
    "print(f\"Finance emails found: {len(finance_emails):,}\")\n",
    "print(f\"Percentage: {len(finance_emails)/len(all_emails)*100:.1f}%\")\n",
    "\n",
    "# Show sample\n",
    "print(f\"\\n=== SAMPLE FINANCE EMAILS ===\")\n",
    "for i, email in enumerate(finance_emails[:5]):\n",
    "    print(f\"{i+1}. {email['subject'][:60]}\")\n",
    "\n",
    "# Cell 3: Stricter filter - transaction emails only\n",
    "def is_transaction_email(email):\n",
    "    \"\"\"Check if email contains actual transaction data.\"\"\"\n",
    "    \n",
    "    body = email['body'].lower()\n",
    "    subject = email['subject'].lower()\n",
    "    combined = f\"{subject} {body}\"\n",
    "    \n",
    "    # Must have transaction indicators\n",
    "    has_transaction = any(kw in combined for kw in ['debited', 'credited', 'payment of', 'transferred'])\n",
    "    \n",
    "    # Must have amount pattern (Rs. or ₹)\n",
    "    has_amount = bool(re.search(r'(?:rs\\.?|₹)\\s*[\\d,]+', combined))\n",
    "    \n",
    "    # Must have account reference\n",
    "    has_account = bool(re.search(r'(?:account|a/c|ac no)', combined))\n",
    "    \n",
    "    return has_transaction and has_amount\n",
    "\n",
    "# Apply stricter filter\n",
    "transaction_emails = [e for e in all_emails if is_transaction_email(e)]\n",
    "\n",
    "print(f\"Transaction emails found: {len(transaction_emails):,}\")\n",
    "\n",
    "# Show samples\n",
    "print(f\"\\n=== SAMPLE TRANSACTION EMAILS ===\")\n",
    "for i, email in enumerate(transaction_emails[:5]):\n",
    "    print(f\"\\n{i+1}. Subject: {email['subject'][:70]}\")\n",
    "    print(f\"   Body: {email['body'][:150]}...\")\n",
    "\n",
    "# Cell 4: Entity extraction function\n",
    "def extract_entities(text):\n",
    "    \"\"\"Extract financial entities from email text.\"\"\"\n",
    "    \n",
    "    entities = {}\n",
    "    \n",
    "    # Amount\n",
    "    amount_match = re.search(r'(?:Rs\\.?|₹)\\s*([\\d,]+(?:\\.\\d{2})?)', text)\n",
    "    if amount_match:\n",
    "        entities['amount'] = amount_match.group(1).replace(',', '')\n",
    "    \n",
    "    # Type\n",
    "    if 'debited' in text.lower():\n",
    "        entities['type'] = 'debit'\n",
    "    elif 'credited' in text.lower():\n",
    "        entities['type'] = 'credit'\n",
    "    \n",
    "    # Account\n",
    "    account_match = re.search(r'(?:account|A/C|a/c)\\s*[:\\s]?\\s*(\\w+)', text, re.IGNORECASE)\n",
    "    if account_match:\n",
    "        entities['account'] = account_match.group(1)\n",
    "    \n",
    "    # Date\n",
    "    date_match = re.search(r'(\\d{2}-\\d{2}-\\d{2,4})', text)\n",
    "    if date_match:\n",
    "        entities['date'] = date_match.group(1)\n",
    "    \n",
    "    # Reference\n",
    "    ref_match = re.search(r'reference\\s*(?:number|no\\.?)?\\s*(?:is)?\\s*(\\d+)', text, re.IGNORECASE)\n",
    "    if ref_match:\n",
    "        entities['reference'] = ref_match.group(1)\n",
    "    \n",
    "    return entities\n",
    "\n",
    "# Test\n",
    "sample = transaction_emails[0]\n",
    "print(f\"Subject: {sample['subject'][:60]}\")\n",
    "print(f\"Extracted: {extract_entities(sample['body'])}\")\n",
    "\n",
    "# Cell 5: Create training data for fine-tuning\n",
    "training_data = []\n",
    "\n",
    "for email in transaction_emails:\n",
    "    entities = extract_entities(email['body'])\n",
    "    \n",
    "    # Skip if no entities extracted\n",
    "    if len(entities) < 2:\n",
    "        continue\n",
    "    \n",
    "    # Create training example\n",
    "    example = {\n",
    "        \"prompt\": f\"Extract financial entities from this email:\\n\\nSubject: {email['subject']}\\n\\nBody: {email['body'][:1500]}\",\n",
    "        \"completion\": json.dumps(entities, indent=2)\n",
    "    }\n",
    "    \n",
    "    training_data.append(example)\n",
    "\n",
    "print(f\"Training examples created: {len(training_data)}\")\n",
    "\n",
    "# Show sample\n",
    "print(f\"\\n=== SAMPLE TRAINING EXAMPLE ===\")\n",
    "print(f\"PROMPT:\\n{training_data[0]['prompt'][:300]}...\")\n",
    "print(f\"\\nCOMPLETION:\\n{training_data[0]['completion']}\")\n",
    "\n",
    "# Cell 6: Split and save training data\n",
    "random.seed(42)\n",
    "random.shuffle(training_data)\n",
    "\n",
    "# Split: 90% train, 10% validation\n",
    "split_idx = int(len(training_data) * 0.9)\n",
    "train_data = training_data[:split_idx]\n",
    "valid_data = training_data[split_idx:]\n",
    "\n",
    "print(f\"Train set: {len(train_data)}\")\n",
    "print(f\"Validation set: {len(valid_data)}\")\n",
    "\n",
    "# Save as JSONL (required format for MLX)\n",
    "training_dir = PROJECT / \"data/training\"\n",
    "training_dir.mkdir(exist_ok=True)\n",
    "\n",
    "# Save train.jsonl\n",
    "with open(training_dir / \"train.jsonl\", 'w') as f:\n",
    "    for example in train_data:\n",
    "        f.write(json.dumps(example) + '\\n')\n",
    "\n",
    "# Save valid.jsonl\n",
    "with open(training_dir / \"valid.jsonl\", 'w') as f:\n",
    "    for example in valid_data:\n",
    "        f.write(json.dumps(example) + '\\n')\n",
    "\n",
    "print(f\"\\n✅ Saved to {training_dir}/\")\n",
    "print(f\"   train.jsonl ({len(train_data)} examples)\")\n",
    "print(f\"   valid.jsonl ({len(valid_data)} examples)\")\n",
    "\n",
    "# Cell 7: Fine-tune configuration\n",
    "print(\"=== FINE-TUNING SETUP ===\")\n",
    "print(f\"Base model: {PROJECT / 'models/base/phi3-mini'}\")\n",
    "print(f\"Training data: {PROJECT / 'data/training/train.jsonl'}\")\n",
    "print(f\"Validation data: {PROJECT / 'data/training/valid.jsonl'}\")\n",
    "print(f\"Output: {PROJECT / 'models/adapters/finance-lora'}\")\n",
    "\n",
    "print(\"\\n⚠️ Fine-tuning will take 1-2 hours\")\n",
    "print(\"Run the following command in Terminal (not in notebook):\")\n",
    "\n",
    "command = f\"\"\"\n",
    "cd {PROJECT}\n",
    "source venv/bin/activate\n",
    "\n",
    "mlx_lm.lora \\\\\n",
    "    --model models/base/phi3-mini \\\\\n",
    "    --data data/training \\\\\n",
    "    --train \\\\\n",
    "    --batch-size 1 \\\\\n",
    "    --lora-layers 8 \\\\\n",
    "    --iters 500 \\\\\n",
    "    --adapter-path models/adapters/finance-lora\n",
    "\"\"\"\n",
    "\n",
    "print(command)\n",
    "\n",
    "# Cell 8: Test fine-tuned model\n",
    "\n",
    "\n",
    "# Load base model with fine-tuned adapter\n",
    "model_path = str(PROJECT / \"models/base/phi3-mini\")\n",
    "adapter_path = str(PROJECT / \"models/adapters/finance-lora\")\n",
    "\n",
    "print(\"Loading fine-tuned model...\")\n",
    "model, tokenizer = load(model_path, adapter_path=adapter_path)\n",
    "print(\"✅ Model loaded with LoRA adapter\")\n",
    "\n",
    "# Cell 9: Test entity extraction on sample email\n",
    "test_email = \"\"\"\n",
    "Dear Customer, Rs.2500.00 has been debited from account 3545 to VPA swiggy@ybl \n",
    "for Swiggy order on 28-12-25. Your UPI transaction reference number is 534567891234. \n",
    "If you did not authorize this transaction, please call 1800-XXX-XXXX.\n",
    "\"\"\"\n",
    "\n",
    "prompt = f\"Extract financial entities from this email:\\n\\n{test_email}\"\n",
    "\n",
    "print(\"=== INPUT EMAIL ===\")\n",
    "print(test_email)\n",
    "print(\"\\n=== MODEL OUTPUT ===\")\n",
    "\n",
    "response = generate(\n",
    "    model,\n",
    "    tokenizer,\n",
    "    prompt=prompt,\n",
    "    max_tokens=200,\n",
    "    verbose=False\n",
    ")\n",
    "\n",
    "print(response)\n",
    "\n",
    "# Cell 10: Test on credit transaction\n",
    "test_email_2 = \"\"\"\n",
    "Dear Customer, Rs.45,000.00 has been credited to your account 7890 \n",
    "on 27-12-25. Salary from ACME CORP. Reference: NEFT123456789.\n",
    "Available balance: Rs.52,340.00\n",
    "\"\"\"\n",
    "\n",
    "prompt = f\"Extract financial entities from this email:\\n\\n{test_email_2}\"\n",
    "\n",
    "print(\"=== INPUT EMAIL ===\")\n",
    "print(test_email_2)\n",
    "print(\"\\n=== MODEL OUTPUT ===\")\n",
    "\n",
    "response = generate(\n",
    "    model,\n",
    "    tokenizer,\n",
    "    prompt=prompt,\n",
    "    max_tokens=200,\n",
    "    verbose=False\n",
    ")\n",
    "\n",
    "print(response)\n",
    "\n",
    "# Cell 11: Test credit transaction with better prompt\n",
    "test_email_credit = \"\"\"Subject: Amount Credited to Your Account\n",
    "\n",
    "Dear Customer, Rs.45,000.00 has been credited to your account 7890 \n",
    "on 27-12-25. Salary from ACME CORP. Reference: NEFT123456789.\n",
    "Available balance: Rs.52,340.00\"\"\"\n",
    "\n",
    "prompt = f\"\"\"Extract financial entities from this email:\n",
    "\n",
    "Subject: Amount Credited to Your Account\n",
    "\n",
    "Body: {test_email_credit}\"\"\"\n",
    "\n",
    "print(\"=== INPUT ===\")\n",
    "print(test_email_credit)\n",
    "print(\"\\n=== MODEL OUTPUT ===\")\n",
    "\n",
    "response = generate(\n",
    "    model,\n",
    "    tokenizer,\n",
    "    prompt=prompt,\n",
    "    max_tokens=200,\n",
    "    verbose=False\n",
    ")\n",
    "\n",
    "print(response)\n",
    "\n",
    "# Cell 12: Merge LoRA adapter with base model\n",
    "import shutil\n",
    "\n",
    "# Create merged model directory\n",
    "merged_path = PROJECT / \"models/merged/finance-llm\"\n",
    "merged_path.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "print(\"To merge the model, run this in Terminal:\")\n",
    "print()\n",
    "merge_command = f\"\"\"cd {PROJECT}\n",
    "source venv/bin/activate\n",
    "\n",
    "mlx_lm.fuse \\\\\n",
    "    --model models/base/phi3-mini \\\\\n",
    "    --adapter-path models/adapters/finance-lora \\\\\n",
    "    --save-path models/merged/finance-llm\n",
    "\"\"\"\n",
    "print(merge_command)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef29ad32-346f-477c-9bc2-d5ac87c79da1",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}