{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "dd5f3f76-aba8-48f3-85f4-47051339472d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "a9f7422a-7189-467c-8b98-47003609c329", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/ranjit/llm-mail-trainer/venv/lib/python3.9/site-packages/urllib3/__init__.py:35: NotOpenSSLWarning: urllib3 v2 only supports OpenSSL 1.1.1+, currently the 'ssl' module is compiled with 'LibreSSL 2.8.3'. See: https://github.com/urllib3/urllib3/issues/3020\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "MBOX file exists: True\n", "File size: 2.94 GB\n", "Total emails in mailbox: 41,948\n", "===EMAIL KEYS===\n", "['X-GM-THRID', 'X-Gmail-Labels', 'Delivered-To', 'Received', 'X-Google-Smtp-Source', 'X-Received', 'ARC-Seal', 'ARC-Message-Signature', 'ARC-Authentication-Results', 'Return-Path', 'Received', 'Received-SPF', 'Authentication-Results', 'DKIM-Signature', 'DKIM-Signature', 'Received', 'Received', 'Content-Transfer-Encoding', 'Content-Type', 'Date', 'From', 'Mime-Version', 'Message-ID', 'Subject', 'Reply-To', 'Feedback-ID', 'List-Unsubscribe', 'List-Unsubscribe-Post', 'x-campaignid', 'X-SG-EID', 'X-SG-ID', 'To', 'X-Entity-ID']\n", "\n", "=== SUBJECT ===\n", "Update: Your secret santa is here\n", "\n", "=== FROM ===\n", "Internshala Trainings \n", "\n", "=== DATE ===\n", "Thu, 25 Dec 2025 12:02:27 +0000 (UTC)\n", "\n", "=== CONTENT TYPE ===\n", "text/html\n", "Body length: 273 characters\n", "\n", "=== FIRST 500 CHARS ===\n", "Internshala Trainings Internshala (Scholiverse Educare Pvt. Ltd.) 901A and 901B, Iris Tech Park, Sector - 48, Sohna Road, Gurugram Don't want learning opportunities delivered to your inbox? Unsubscribe If you'd like to unsubscribe and stop receiving these emails click here\n", "Before cleaning: 273 chars\n", "After cleaning: 273 chars\n", "\n", "=== CLEANED TEXT ===\n", "Internshala Trainings Internshala (Scholiverse Educare Pvt. Ltd.) 901A and 901B, Iris Tech Park, Sector - 48, Sohna Road, Gurugram Don't want learning opportunities delivered to your inbox? Unsubscribe If you'd like to unsubscribe and stop receiving these emails click here\n", "=== DECODED SUBJECT ===\n", "πŸ“ˆΒ Dear Ranjit Behera, diversify beyond just savings\n", "\n", "=== DECODED FROM ===\n", "\"HDFC Bank\" \n", "=== PARSED EMAIL ===\n", "subject: πŸ“ˆ Dear Ranjit Behera, diversify beyond just savings\n", "sender: \"HDFC Bank\"\n", "date: Tue, 23 Dec 2025 17:32:07 +0530\n", "body: SmartWealth helps you invest smarter ---------------------------------------------------------------------------- To view this message in HTML format, click here: or paste this link in a Web browser -...\n", "Parsing all the mails ...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "7b19a8ddce014ce4b16d2321c92fcf2a", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Processing: 0%| | 0/41948 [00:00\", \"confidence\": \"\", \"reason\": \"\"}}\n", "\"\"\"\n", "\n", "def build_prompt(email_data):\n", " \"\"\"Build classification prompt for one email.\"\"\"\n", " return CLASSIFICATION_PROMPT.format(\n", " subject=email_data['subject'][:200],\n", " sender=email_data['sender'][:100],\n", " body=email_data['body'][:2000]\n", " )\n", "\n", "# Test: See what prompt looks like\n", "test_prompt = build_prompt(sample_emails[0])\n", "print(f\"Prompt length: {len(test_prompt)} characters\")\n", "print(f\"\\n=== PROMPT PREVIEW ===\\n{test_prompt[:1000]}...\")\n", "\n", "# Cell 14: Load Phi-3 model\n", "model_path = str(PROJECT / \"models/base/phi3-mini\")\n", "\n", "print(\"Loading Phi-3 model...\")\n", "model, tokenizer = load(model_path)\n", "print(\"βœ… Model loaded\")\n", "\n", "# Cell 15: Test classification on one email\n", "test_email = sample_emails[0]\n", "\n", "# Build prompt\n", "prompt = build_prompt(test_email)\n", "\n", "# Send to Phi-3\n", "print(\"Classifying email...\")\n", "print(f\"Subject: {test_email['subject'][:80]}...\")\n", "print(\"-\" * 50)\n", "\n", "response = generate(\n", " model, \n", " tokenizer, \n", " prompt=prompt,\n", " max_tokens=100,\n", " verbose=False\n", ")\n", "\n", "print(f\"\\n=== PHI-3 RESPONSE ===\\n{response}\")\n", "\n", "# Cell 16: JSON extraction helper\n", "\n", "def extract_json(response):\n", " \"\"\"Extract JSON object from LLM response.\"\"\"\n", "\n", " # Find JSON pattern in response\n", " match = re.search(r'\\{[^{}]*\\}', response)\n", "\n", " if(match):\n", " try:\n", " return json.loads(match.group())\n", " except json.JSONDecodeError:\n", " return None\n", " return None\n", "\n", "# Test on previous response\n", "parsed = extract_json(response)\n", "\n", "print(\"=== EXTRACTED JSON ===\")\n", "print(parsed)\n", "print(f\"\\nCategory: {parsed['category']}\")\n", "print(f\"Confidence: {parsed['confidence']}\")\n", "print(f\"Reason: {parsed['reason']}\")\n", "\n", "# Cell 17: Classify all sample emails\n", "results = []\n", "failed = 0\n", "\n", "print(f\"Classifying {len(sample_emails)} emails...\")\n", "print(\"Estimated time: ~5 minutes\\n\")\n", "\n", "start_time = time.time()\n", "\n", "for i, email_data in enumerate(tqdm(sample_emails, desc=\"Classifying\")):\n", " try:\n", " # Build prompt\n", " prompt = build_prompt(email_data)\n", " \n", " # Get classification\n", " response = generate(\n", " model, \n", " tokenizer, \n", " prompt=prompt,\n", " max_tokens=100,\n", " verbose=False\n", " )\n", " \n", " # Extract JSON\n", " parsed = extract_json(response)\n", " \n", " if parsed:\n", " results.append({\n", " 'id': email_data.get('id', i),\n", " 'subject': email_data['subject'],\n", " 'sender': email_data['sender'],\n", " 'category': parsed.get('category', 'other'),\n", " 'confidence': parsed.get('confidence', 'low'),\n", " 'reason': parsed.get('reason', '')\n", " })\n", " else:\n", " failed += 1\n", " \n", " except Exception as e:\n", " failed += 1\n", " continue\n", "\n", "elapsed = time.time() - start_time\n", "\n", "print(f\"\\nβœ… Classified: {len(results)}\")\n", "print(f\"❌ Failed: {failed}\")\n", "print(f\"⏱️ Time: {elapsed/60:.1f} minutes\")\n", "print(f\"⚑ Speed: {len(results)/elapsed:.1f} emails/sec\")\n", "\n", "# Cell 18: Category distribution\n", "\n", "categories = Counter([r['category'] for r in results])\n", "\n", "print(\"=== CATEGORY DISTRIBUTION ===\\n\")\n", "for category, count in categories.most_common():\n", " pct = count / len(results) * 100\n", " bar = \"β–ˆ\" * int(pct / 2)\n", " print(f\"{category:12} {count:4} ({pct:5.1f}%) {bar}\")\n", "\n", "print(f\"\\nπŸ“Š Total classified: {len(results)}\")\n", "\n", "# Cell 19: Save classification results\n", "results_path = PROJECT / \"data/parsed/classification_results.json\"\n", "\n", "with open(results_path, 'w', encoding='utf-8') as f:\n", " json.dump(results, f, ensure_ascii=False, indent=2)\n", "\n", "print(f\"βœ… Saved {len(results)} results to {results_path}\")\n", "\n", "# Cell 20: Extract finance emails\n", "\n", "finance_results = [r for r in results if r['category'] == 'finance']\n", "\n", "print(f\"=== FINANCE EMAILS: {len(finance_results)} ===\\n\")\n", "\n", "# Show first 10\n", "\n", "for i, email in enumerate(finance_results[:10]):\n", "\n", " print(f\"{i+1}. {email['subject'][:70]}\")\n", "\n", " print(f\" Sender: {email['sender'][:50]}\")\n", "\n", " print(f\" Reason: {email['reason']}\")\n", "\n", " print()\n", "\n", "# Cell 21: Get full details of finance emails\n", "finance_results = [r for r in results if r['category'] == 'finance']\n", "\n", "# Get full email data for finance emails\n", "finance_ids = [r['id'] for r in finance_results]\n", "finance_emails_full = [e for e in parsed_emails if e['id'] in finance_ids]\n", "\n", "print(f\"Finance emails with full body: {len(finance_emails_full)}\")\n", "\n", "# Show senders\n", "print(\"\\n=== FINANCE SENDERS ===\")\n", "senders = [e['sender'] for e in finance_emails_full]\n", "for sender, count in Counter(senders).most_common(15):\n", " print(f\" {sender[:50]:50} : {count}\")\n", "\n", " \n", "\n", "\n", "\n", " \n", " \n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "2c84628e-8702-4157-94ea-11dfee9aa2db", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.6" } }, "nbformat": 4, "nbformat_minor": 5 }