{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 🤖 AI Agent Comprehensive Training Notebook\n", "\n", "## Real-Time Cyber Forge Agentic AI Platform\n", "\n", "This notebook trains an AI agent with:\n", "1. **Communication Skills** - Natural language processing and context understanding\n", "2. **Cybersecurity Expertise** - Threat detection and vulnerability analysis\n", "3. **Web Scraping Capabilities** - Intelligence gathering and IOC extraction\n", "4. **Real-time Integration** - Desktop and mobile app connectivity\n", "\n", "**Author:** Cyber Forge AI Team  \n", "**Date:** 2024\n", "\n", "---\n", "\n", "### 🎯 Training Objectives:\n", "- Build conversational AI for cybersecurity communication\n", "- Train threat detection models with high accuracy\n", "- Implement web scraping for threat intelligence\n", "- Create real-time monitoring capabilities\n", "- Deploy models for production integration" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 📦 Package Installation and Setup\n", "\n", "First, let's install all required packages for the AI agent training." 
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install required packages\n",
    "import subprocess\n",
    "import sys\n",
    "\n",
    "\n",
    "def install_package(package):\n",
    "    \"\"\"Install one pip requirement into the running kernel's environment.\n",
    "\n",
    "    Args:\n",
    "        package: pip requirement specifier, e.g. 'pandas>=2.0.0'.\n",
    "\n",
    "    Raises:\n",
    "        subprocess.CalledProcessError: if pip exits with a non-zero status.\n",
    "    \"\"\"\n",
    "    # sys.executable makes pip target this kernel's interpreter rather than\n",
    "    # whichever `pip` happens to be first on PATH.\n",
    "    subprocess.check_call([sys.executable, \"-m\", \"pip\", \"install\", package])\n",
    "\n",
    "\n",
    "# Core packages for AI training\n",
    "required_packages = [\n",
    "    'tensorflow>=2.13.0',\n",
    "    'transformers>=4.30.0',\n",
    "    'torch>=2.0.0',\n",
    "    'scikit-learn>=1.3.0',\n",
    "    'pandas>=2.0.0',\n",
    "    'numpy>=1.24.0',\n",
    "    'matplotlib>=3.7.0',\n",
    "    'seaborn>=0.12.0',\n",
    "    'nltk>=3.8.0',\n",
    "    'spacy>=3.6.0',\n",
    "    'beautifulsoup4>=4.12.0',\n",
    "    'requests>=2.31.0',\n",
    "    'selenium>=4.10.0',\n",
    "    'openai>=0.27.0',\n",
    "    'chromadb>=0.4.0',\n",
    "    'joblib>=1.3.0'\n",
    "]\n",
    "\n",
    "print(\"🚀 Installing required packages...\")\n",
    "failed_packages = []\n",
    "for package in required_packages:\n",
    "    try:\n",
    "        install_package(package)\n",
    "        print(f\"✅ Installed {package}\")\n",
    "    except (subprocess.CalledProcessError, OSError) as e:\n",
    "        # Best effort: keep going so one bad requirement does not block the\n",
    "        # rest, but remember the failure so the summary below is honest.\n",
    "        failed_packages.append(package)\n",
    "        print(f\"❌ Failed to install {package}: {e}\")\n",
    "\n",
    "if failed_packages:\n",
    "    print(f\"⚠️ {len(failed_packages)} package(s) failed to install: {', '.join(failed_packages)}\")\n",
    "print(\"🎯 Package installation completed!\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 🗣️ Part 1: Communication Skills Training\n",
    "\n",
    "Training the AI agent to communicate effectively about cybersecurity topics."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import libraries for communication training\n",
    "import os\n",
    "from pathlib import Path\n",
    "\n",
    "import joblib\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Configuration: everything a reader might want to tune lives here\n",
    "MODEL_DIR = Path('../models/communication')  # where trained artifacts are written\n",
    "N_VARIATIONS = 5   # synthetic variations generated per seed example\n",
    "RANDOM_SEED = 42   # fixed seed so the random forests are reproducible"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create communication training dataset\n",
    "communication_data = [\n",
    "    {\n",
    "        \"context\": \"threat_detection\",\n",
    "        \"input\": \"We detected a potential malware on your system\",\n",
    "        \"response\": \"I understand your concern. Let me explain what we found and the recommended actions to secure your system.\",\n",
    "        \"tone\": \"professional_reassuring\"\n",
    "    },\n",
    "    {\n",
    "        \"context\": \"user_education\",\n",
    "        \"input\": \"What is phishing?\",\n",
    "        \"response\": \"Phishing is a cybersecurity attack where criminals impersonate legitimate organizations to steal sensitive information like passwords or credit card numbers.\",\n",
    "        \"tone\": \"educational_clear\"\n",
    "    },\n",
    "    {\n",
    "        \"context\": \"incident_response\",\n",
    "        \"input\": \"My computer is acting strange and slow\",\n",
    "        \"response\": \"That could indicate a security issue. Let's investigate this step by step. First, can you tell me when you first noticed these symptoms?\",\n",
    "        \"tone\": \"helpful_diagnostic\"\n",
    "    },\n",
    "    {\n",
    "        \"context\": \"security_briefing\",\n",
    "        \"input\": \"Can you explain our security status?\",\n",
    "        \"response\": \"Based on our latest analysis, your network shows good security health with no critical threats detected. I've identified a few areas for improvement that I'll detail for you.\",\n",
    "        \"tone\": \"informative_confident\"\n",
    "    },\n",
    "    {\n",
    "        \"context\": \"emergency_response\",\n",
    "        \"input\": \"URGENT: Security breach detected!\",\n",
    "        \"response\": \"I understand this is urgent. I'm immediately analyzing your network traffic and will provide you with a real-time security assessment and response plan.\",\n",
    "        \"tone\": \"calm_urgent\"\n",
    "    }\n",
    "]\n",
    "\n",
    "# Expand dataset with variations\n",
    "expanded_data = []\n",
    "for item in communication_data:\n",
    "    if 'input' not in item:\n",
    "        # Skip the record entirely: without text it would become NaN in\n",
    "        # df['input'] and make TfidfVectorizer.fit_transform fail later.\n",
    "        print(f\"⚠️ Warning: Item missing 'input' field: {item.get('context', 'Unknown')}\")\n",
    "        continue\n",
    "    expanded_data.append(item)\n",
    "    for i in range(N_VARIATIONS):\n",
    "        # Shallow copy is enough: every value in the record is an immutable str\n",
    "        variation = item.copy()\n",
    "        variation['input'] = f\"Variation {i+1}: {item['input']}\"\n",
    "        expanded_data.append(variation)\n",
    "\n",
    "if not expanded_data:\n",
    "    raise ValueError(\"No usable communication examples: every item is missing 'input'\")\n",
    "\n",
    "df = pd.DataFrame(expanded_data)\n",
    "assert df['input'].notna().all(), \"every training example needs an input text\"\n",
    "\n",
    "print(f\"✅ Created communication dataset with {len(df)} examples\")\n",
    "print(f\"📊 Context distribution: {df['context'].value_counts().to_dict()}\")\n",
    "\n",
    "# Display sample data (last expression -> rich DataFrame rendering)\n",
    "print(\"\\n📋 Sample data:\")\n",
    "df[['context', 'input', 'tone']].head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train communication models\n",
    "print(\"🎯 Training communication classifier...\")\n",
    "\n",
    "# Prepare features: TF-IDF over the input text\n",
    "vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')\n",
    "X = vectorizer.fit_transform(df['input'])\n",
    "\n",
    "# Encode labels: one encoder per target so predictions can be decoded later\n",
    "context_encoder = LabelEncoder()\n",
    "tone_encoder = LabelEncoder()\n",
    "\n",
    "y_context = context_encoder.fit_transform(df['context'])\n",
    "y_tone = tone_encoder.fit_transform(df['tone'])\n",
    "\n",
    "# Train models: two independent classifiers on the same features\n",
    "context_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n",
    "tone_model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)\n",
    "\n",
    "context_model.fit(X, y_context)\n",
    "tone_model.fit(X, y_tone)\n",
    "\n",
    "# Sanity check only: the dataset is tiny and built from near-duplicate\n",
    "# variations, so this is training accuracy, not real-world performance.\n",
    "print(f\"📊 Training accuracy - context: {context_model.score(X, y_context):.2f}, tone: {tone_model.score(X, y_tone):.2f}\")\n",
    "\n",
    "# Save models: file names are unchanged so existing loaders keep working\n",
    "os.makedirs(MODEL_DIR, exist_ok=True)\n",
    "artifacts = {\n",
    "    'vectorizer.pkl': vectorizer,\n",
    "    'context_classifier.pkl': context_model,\n",
    "    'tone_classifier.pkl': tone_model,\n",
    "    'context_encoder.pkl': context_encoder,\n",
    "    'tone_encoder.pkl': tone_encoder,\n",
    "}\n",
    "for filename, artifact in artifacts.items():\n",
    "    joblib.dump(artifact, MODEL_DIR / filename)\n",
    "\n",
    "print(\"✅ Communication models trained and saved!\")\n",
    "print(f\"📁 Models saved in: {MODEL_DIR.as_posix()}/\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.15.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}