File size: 7,734 Bytes

dcc24f8

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfd5cfe5-ea7e-49d8-9ef3-5d43bef5a0cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "import json\n",
    "import re\n",
    "import random\n",
    "import time\n",
    "from pathlib import Path\n",
    "from tqdm.notebook import tqdm\n",
    "from mlx_lm import load, generate\n",
    "from collections import Counter\n",
    "\n",
    "\n",
    "# Cell 11: Load parsed emails from cache\n",
    "\n",
    "cache_path = PROJECT / \"data/parsed/emails.json\"\n",
    "\n",
    "with open(cache_path, 'r', encoding='utf-8') as f:\n",
    "    parsed_emails = json.load(f)\n",
    "\n",
    "print(f\"✅ Loaded {len(parsed_emails):,} emails from cache\")\n",
    "\n",
    "# Cell 12: Random sampling\n",
    "random.seed(42)\n",
    "\n",
    "# Pick 500 random emails\n",
    "sample_size = 500\n",
    "sample_emails = random.sample(parsed_emails,sample_size)\n",
    "\n",
    "print(f\"Total emails: {len(parsed_emails):,}\")\n",
    "print(f\"Sample size: {len(sample_emails)}\")\n",
    "\n",
    "# Preview one sample\n",
    "print(f\"\\n=== SAMPLE EMAIL #1 ===\")\n",
    "print(f\"Subject: {sample_emails[0]['subject']}\")\n",
    "print(f\"Sender: {sample_emails[0]['sender']}\")\n",
    "print(f\"Body: {sample_emails[0]['body'][:300]}...\")\n",
    "\n",
    "# Cell 13: Classification prompt template\n",
    "\n",
    "CLASSIFICATION_PROMPT = \"\"\"You are an email classifier. Analyze this email and categorize it.\n",
    "\n",
    "EMAIL:\n",
    "Subject: {subject}\n",
    "From: {sender}\n",
    "Body: {body}\n",
    "\n",
    "TASK:\n",
    "Classify this email into exactly ONE category.\n",
    "\n",
    "CATEGORIES:\n",
    "- finance: Banks, payments, transactions, investments, credit cards, loans, UPI, wallets\n",
    "- shopping: Orders, deliveries, purchases, e-commerce\n",
    "- social: Social networks, personal messages, invitations\n",
    "- work: Job-related, recruitment, office, meetings, projects\n",
    "- newsletter: Digests, subscriptions, blogs, articles\n",
    "- promotional: Marketing, offers, discounts, advertisements\n",
    "- other: Anything that doesn't fit above\n",
    "\n",
    "OUTPUT FORMAT (JSON only, no other text):\n",
    "{{\"category\": \"<category>\", \"confidence\": \"<high/medium/low>\", \"reason\": \"<brief 5-10 word reason>\"}}\n",
    "\"\"\"\n",
    "\n",
    "def build_prompt(email_data):\n",
    "    \"\"\"Build classification prompt for one email.\"\"\"\n",
    "    return CLASSIFICATION_PROMPT.format(\n",
    "        subject=email_data['subject'][:200],\n",
    "        sender=email_data['sender'][:100],\n",
    "        body=email_data['body'][:2000]\n",
    "    )\n",
    "\n",
    "# Test: See what prompt looks like\n",
    "test_prompt = build_prompt(sample_emails[0])\n",
    "print(f\"Prompt length: {len(test_prompt)} characters\")\n",
    "print(f\"\\n=== PROMPT PREVIEW ===\\n{test_prompt[:1000]}...\")\n",
    "\n",
    "# Cell 14: Load Phi-3 model\n",
    "model_path = str(PROJECT / \"models/base/phi3-mini\")\n",
    "\n",
    "print(\"Loading Phi-3 model...\")\n",
    "model, tokenizer = load(model_path)\n",
    "print(\"✅ Model loaded\")\n",
    "\n",
    "# Cell 15: Test classification on one email\n",
    "test_email = sample_emails[0]\n",
    "\n",
    "# Build prompt\n",
    "prompt = build_prompt(test_email)\n",
    "\n",
    "# Send to Phi-3\n",
    "print(\"Classifying email...\")\n",
    "print(f\"Subject: {test_email['subject'][:80]}...\")\n",
    "print(\"-\" * 50)\n",
    "\n",
    "response = generate(\n",
    "    model, \n",
    "    tokenizer, \n",
    "    prompt=prompt,\n",
    "    max_tokens=100,\n",
    "    verbose=False\n",
    ")\n",
    "\n",
    "print(f\"\\n=== PHI-3 RESPONSE ===\\n{response}\")\n",
    "\n",
    "# Cell 16: JSON extraction helper\n",
    "\n",
    "def extract_json(response):\n",
    "    \"\"\"Extract JSON object from LLM response.\"\"\"\n",
    "\n",
    "    # Find JSON pattern in response\n",
    "    match = re.search(r'\\{[^{}]*\\}', response)\n",
    "\n",
    "    if(match):\n",
    "          try:\n",
    "              return json.loads(match.group())\n",
    "          except json.JSONDecodeError:\n",
    "              return None\n",
    "    return None\n",
    "\n",
    "# Test on previous response\n",
    "parsed = extract_json(response)\n",
    "\n",
    "print(\"=== EXTRACTED JSON ===\")\n",
    "print(parsed)\n",
    "print(f\"\\nCategory: {parsed['category']}\")\n",
    "print(f\"Confidence: {parsed['confidence']}\")\n",
    "print(f\"Reason: {parsed['reason']}\")\n",
    "\n",
    "# Cell 17: Classify all sample emails\n",
    "results = []\n",
    "failed = 0\n",
    "\n",
    "print(f\"Classifying {len(sample_emails)} emails...\")\n",
    "print(\"Estimated time: ~5 minutes\\n\")\n",
    "\n",
    "start_time = time.time()\n",
    "\n",
    "for i, email_data in enumerate(tqdm(sample_emails, desc=\"Classifying\")):\n",
    "    try:\n",
    "        # Build prompt\n",
    "        prompt = build_prompt(email_data)\n",
    "        \n",
    "        # Get classification\n",
    "        response = generate(\n",
    "            model, \n",
    "            tokenizer, \n",
    "            prompt=prompt,\n",
    "            max_tokens=100,\n",
    "            verbose=False\n",
    "        )\n",
    "        \n",
    "        # Extract JSON\n",
    "        parsed = extract_json(response)\n",
    "        \n",
    "        if parsed:\n",
    "            results.append({\n",
    "                'id': email_data.get('id', i),\n",
    "                'subject': email_data['subject'],\n",
    "                'sender': email_data['sender'],\n",
    "                'category': parsed.get('category', 'other'),\n",
    "                'confidence': parsed.get('confidence', 'low'),\n",
    "                'reason': parsed.get('reason', '')\n",
    "            })\n",
    "        else:\n",
    "            failed += 1\n",
    "            \n",
    "    except Exception as e:\n",
    "        failed += 1\n",
    "        continue\n",
    "\n",
    "elapsed = time.time() - start_time\n",
    "\n",
    "print(f\"\\n✅ Classified: {len(results)}\")\n",
    "print(f\"❌ Failed: {failed}\")\n",
    "print(f\"⏱️ Time: {elapsed/60:.1f} minutes\")\n",
    "print(f\"⚡ Speed: {len(results)/elapsed:.1f} emails/sec\")\n",
    "\n",
    "# Cell 18: Category distribution\n",
    "\n",
    "categories = Counter([r['category'] for r in results])\n",
    "\n",
    "print(\"=== CATEGORY DISTRIBUTION ===\\n\")\n",
    "for category, count in categories.most_common():\n",
    "    pct = count / len(results) * 100\n",
    "    bar = \"█\" * int(pct / 2)\n",
    "    print(f\"{category:12} {count:4} ({pct:5.1f}%) {bar}\")\n",
    "\n",
    "print(f\"\\n📊 Total classified: {len(results)}\")\n",
    "\n",
    "# Cell 19: Save classification results\n",
    "results_path = PROJECT / \"data/parsed/classification_results.json\"\n",
    "\n",
    "with open(results_path, 'w', encoding='utf-8') as f:\n",
    "    json.dump(results, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "print(f\"✅ Saved {len(results)} results to {results_path}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}