from llama_cpp import Llama
from typing import Generator, Optional, Dict, Any
import logging
import os
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CybersecurityLLM:
    def __init__(self,
                 repo_id: str = "daskalos-apps/phi4-cybersec-Q4_K_M",
                 filename: str = "phi4-mini-instruct-Q4_K_M.gguf",
                 local_dir: str = "./models",
                 force_download: bool = False):
        """
        Initialize Phi-4 from Hugging Face

        Args:
            repo_id: Your Hugging Face repository ID
            filename: The GGUF filename in the repository
            local_dir: Local directory to cache the model
            force_download: Force re-download even if cached
        """
        # Create local directory if it doesn't exist
        os.makedirs(local_dir, exist_ok=True)

        # Download model from Hugging Face
        logger.info(f"Loading model from Hugging Face: {repo_id}")
        try:
            model_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=local_dir,
                local_dir_use_symlinks=False,
                force_download=force_download
            )
            logger.info(f"Model downloaded/cached at: {model_path}")
        except Exception as e:
            logger.error(f"Failed to download model: {e}")
            # Fall back to a previously downloaded local copy if it exists
            model_path = os.path.join(local_dir, filename)
            if not os.path.exists(model_path):
                raise FileNotFoundError(
                    f"Model not found locally or on Hugging Face: {repo_id}"
                )

        # Initialize llama.cpp with the model
        logger.info("Initializing model...")

        # Check for GPU support via environment variable
        n_gpu_layers = int(os.getenv("N_GPU_LAYERS", "0"))
        if n_gpu_layers > 0:
            logger.info(f"GPU acceleration enabled: {n_gpu_layers} layers")
        else:
            logger.info("Running in CPU-only mode")

        self.llm = Llama(
            model_path=model_path,
            n_ctx=4096,                 # Context window
            n_batch=512,                # Batch size for prompt processing
            n_threads=6 if n_gpu_layers == 0 else 4,  # Fewer threads needed with GPU
            n_gpu_layers=n_gpu_layers,  # GPU layers (0 for CPU-only)
            seed=-1,                    # Random seed
            f16_kv=True,                # Use f16 for key/value cache (saves memory)
            logits_all=False,           # Only compute logits for the last token
            vocab_only=False,           # Load the full model
            use_mmap=True,              # Memory-map the model for efficiency
            use_mlock=False,            # Don't lock the model in RAM
            verbose=True                # Enable verbose output for debugging
        )

        # Store model info
        self.model_info = {
            "repo_id": repo_id,
            "filename": filename,
            "path": model_path,
            "size_mb": os.path.getsize(model_path) / (1024 * 1024)
        }

        # Cybersecurity-focused system prompt
        self.system_prompt = """You are a cybersecurity expert assistant helping employees understand and implement security best practices. Your role is to provide clear, actionable guidance that non-technical users can understand and apply.

Core expertise areas:
• Email Security & Phishing Detection
• Password Management & Authentication
• Malware Prevention & Detection
• Safe Browsing & Download Practices
• Data Protection & Encryption
• Social Engineering Defense
• Remote Work Security
• Incident Response & Reporting
• Physical Security
• Mobile Device Security
• Cloud Security Basics
• Compliance Basics (GDPR, HIPAA, etc.)

Guidelines:
- Always prioritize user safety and security
- Provide step-by-step instructions when applicable
- Use simple language, avoid excessive jargon
- Include real-world examples
- Emphasize prevention over remediation
- Never ask users to disable security features
- If unsure, recommend consulting the IT security team"""

        # Phi-4 uses ChatML-style tags
        self.prompt_template = """<|system|>
{system}<|end|>
<|user|>
{user}<|end|>
<|assistant|>"""

        self.stop_tokens = ["<|end|>", "<|user|>", "<|endoftext|>", "<|assistant|>"]

        logger.info(f"Model ready! Size: {self.model_info['size_mb']:.2f} MB")

    def format_prompt(self, user_input: str, context: Optional[str] = None) -> str:
        """Format the prompt, optionally prepending retrieved context for RAG."""
        if context:
            user_input = f"Context: {context}\n\nQuestion: {user_input}"
        return self.prompt_template.format(
            system=self.system_prompt,
            user=user_input
        )

    def generate(self,
                 prompt: str,
                 max_tokens: int = 512,
                 temperature: float = 0.7,
                 context: Optional[str] = None) -> Dict[str, Any]:
        """Generate a response with metadata."""
        full_prompt = self.format_prompt(prompt, context)
        try:
            response = self.llm(
                full_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=0.95,
                top_k=40,
                repeat_penalty=1.1,
                stop=self.stop_tokens,
                echo=False
            )
            text = response['choices'][0]['text'].strip()
            return {
                "response": text,
                "tokens_used": response['usage']['total_tokens'],
                "model": self.model_info['repo_id']
            }
        except Exception as e:
            logger.error(f"Generation error: {e}")
            return {
                "response": "I apologize, but I encountered an error. Please try rephrasing your question.",
                "error": str(e)
            }

    def generate_stream(self,
                        prompt: str,
                        max_tokens: int = 512,
                        context: Optional[str] = None) -> Generator:
        """Stream response tokens as they are generated."""
        full_prompt = self.format_prompt(prompt, context)
        stream = self.llm(
            full_prompt,
            max_tokens=max_tokens,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            repeat_penalty=1.1,
            stop=self.stop_tokens,
            echo=False,
            stream=True
        )
        for output in stream:
            token = output['choices'][0].get('text', '')
            if token:
                yield token

    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model."""
        return self.model_info
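

# Minimal usage sketch (an assumption, not part of the original module): shows one
# way to exercise the class above end to end. It assumes the default repo_id and
# filename are reachable; set N_GPU_LAYERS in the environment to offload layers to
# the GPU before constructing the model.
if __name__ == "__main__":
    llm = CybersecurityLLM()
    print(llm.get_model_info())

    # Single-shot generation with metadata
    result = llm.generate("How do I recognize a phishing email?")
    print(result["response"])

    # Token-by-token streaming
    for token in llm.generate_stream("What makes a strong password?"):
        print(token, end="", flush=True)
    print()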