from llama_cpp import Llama
from typing import Generator, Optional, Dict, Any
import logging
import os
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CybersecurityLLM:
    def __init__(self,
                 repo_id: str = "daskalos-apps/phi4-cybersec-Q4_K_M",
                 filename: str = "phi4-mini-instruct-Q4_K_M.gguf",
                 local_dir: str = "./models",
                 force_download: bool = False):
        """
        Initialize the Phi-4-mini GGUF model from Hugging Face

        Args:
            repo_id: Your Hugging Face repository ID
            filename: The GGUF filename in the repository
            local_dir: Local directory to cache the model
            force_download: Force re-download even if cached
        """

        # Create local directory if it doesn't exist
        os.makedirs(local_dir, exist_ok=True)

        # Download model from Hugging Face
        logger.info(f"Loading model from Hugging Face: {repo_id}")

        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=local_dir,
                force_download=force_download
            )
            logger.info(f"Model downloaded/cached at: {model_path}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            # Fallback to local file if exists
            model_path = os.path.join(local_dir, filename)
            if not os.path.exists(model_path):
                raise FileNotFoundError(f"Model not found locally or on Hugging Face: {repo_id}")

        # Initialize llama.cpp with the model
        logger.info("Initializing model...")

        # Check for GPU support via environment variable
        n_gpu_layers = int(os.getenv("N_GPU_LAYERS", "0"))
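        # e.g. launch as `N_GPU_LAYERS=35 python <your_script>.py` to offload 35
        # layers (illustrative; requires a CUDA/Metal-enabled llama-cpp-python build)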

        if n_gpu_layers > 0:
            logger.info(f"GPU acceleration enabled: {n_gpu_layers} layers")
        else:
            logger.info("Running in CPU-only mode")

        self.llm = Llama(
            model_path=model_path,
            n_ctx=4096,  # Context window
            n_batch=512,  # Batch size for prompt processing
            n_threads=6 if n_gpu_layers == 0 else 4,  # Fewer threads needed with GPU
            n_gpu_layers=n_gpu_layers,  # GPU layers (0 for CPU-only)
            seed=-1,  # Random seed
            logits_all=False,  # Only compute logits for last token
            vocab_only=False,  # Load full model
            use_mmap=True,  # Memory-map model for efficiency
            use_mlock=False,  # Don't lock model in RAM
            verbose=True  # Enable verbose for debugging
        )

        # Store model info
        self.model_info = {
            "repo_id": repo_id,
            "filename": filename,
            "path": model_path,
            "size_mb": os.path.getsize(model_path) / (1024 * 1024)
        }

        # Cybersecurity-focused system prompt
        self.system_prompt = """You are a cybersecurity expert assistant helping employees understand and implement security best practices. Your role is to provide clear, actionable guidance that non-technical users can understand and apply.

Core expertise areas:
• Email Security & Phishing Detection
• Password Management & Authentication
• Malware Prevention & Detection
• Safe Browsing & Download Practices
• Data Protection & Encryption
• Social Engineering Defense
• Remote Work Security
• Incident Response & Reporting
• Physical Security
• Mobile Device Security
• Cloud Security Basics
• Compliance Basics (GDPR, HIPAA, etc.)

Guidelines:
- Always prioritize user safety and security
- Provide step-by-step instructions when applicable
- Use simple language, avoid excessive jargon
- Include real-world examples
- Emphasize prevention over remediation
- Never ask users to disable security features
- If unsure, recommend consulting IT security team"""

        # Phi-4-mini uses the Phi chat template: <|system|>/<|user|>/<|assistant|> with <|end|> terminators
        self.prompt_template = """<|system|>
{system}<|end|>
<|user|>
{user}<|end|>
<|assistant|>"""

        self.stop_tokens = ["<|end|>", "<|user|>", "<|endoftext|>", "<|assistant|>"]
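
        # Rendered prompt example (illustrative; system text elided):
        #   <|system|>
        #   You are a cybersecurity expert assistant...<|end|>
        #   <|user|>
        #   How can I tell if an email is phishing?<|end|>
        #   <|assistant|>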

        logger.info(f"Model ready! Size: {self.model_info['size_mb']:.2f} MB")

    def format_prompt(self, user_input: str, context: Optional[str] = None) -> str:
        """Format prompt with optional context for RAG"""
        if context:
            user_input = f"Context: {context}\n\nQuestion: {user_input}"

        return self.prompt_template.format(
            system=self.system_prompt,
            user=user_input
        )

    def generate(self,
                 prompt: str,
                 max_tokens: int = 512,
                 temperature: float = 0.7,
                 context: Optional[str] = None) -> Dict[str, Any]:
        """Generate response with metadata"""

        full_prompt = self.format_prompt(prompt, context)

        try:
            response = self.llm(
                full_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.95,
                top_k=40,
                repeat_penalty=1.1,
                stop=self.stop_tokens,
                echo=False
            )

            text = response['choices'][0]['text'].strip()

            return {
                "response": text,
                "tokens_used": response['usage']['total_tokens'],
                "model": self.model_info['repo_id']
            }

        except Exception as e:
            logger.error(f"Generation error: {e}")
            return {
                "response": "I apologize, but I encountered an error. Please try rephrasing your question.",
                "error": str(e)
            }

    def generate_stream(self,
                        prompt: str,
                        max_tokens: int = 512,
                        context: Optional[str] = None) -> Generator[str, None, None]:
        """Stream response tokens"""

        full_prompt = self.format_prompt(prompt, context)

        stream = self.llm(
            full_prompt,
            max_tokens=max_tokens,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repeat_penalty=1.1,
            stop=self.stop_tokens,
            echo=False,
            stream=True
        )

        for output in stream:
            token = output['choices'][0].get('text', '')
            if token:
                yield token

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model"""
        return self.model_info
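

# --- Minimal usage sketch (illustrative) ---
# Assumes llama-cpp-python and huggingface_hub are installed and the repo_id
# above is reachable; the example questions are placeholders.
if __name__ == "__main__":
    llm = CybersecurityLLM()

    # Blocking generation with metadata
    result = llm.generate("How can I recognize a phishing email?")
    print(result["response"])
    print(f"Tokens used: {result.get('tokens_used', 'n/a')}")

    # Streaming generation: print tokens as they arrive
    for token in llm.generate_stream("What makes a strong password?"):
        print(token, end="", flush=True)
    print()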