# blacksinisterx's picture
# fix: keyframe images, video clips, evidence images, live stream webcam+URL, remove demo mode
# fd50325 verified
"""
Local LLM Engine for Report Generation
Handles loading and inference with local GGUF models using llama-cpp-python.
Supports Qwen2.5-3B-Instruct as the primary model, with Phi-3-mini as a fallback.
"""
import os
import logging
from typing import Optional, Dict, Any, List
from pathlib import Path
from .config import ReportConfig, LLMConfig
logger = logging.getLogger(__name__)


class LLMEngine:
    """
    Local LLM engine using llama-cpp-python.

    Wraps a GGUF model (Qwen primary, Phi fallback) behind a small API:
    ``load_model`` / ``generate`` / ``generate_structured`` / ``unload_model``.
    Generation failures are reported through the returned dict rather than
    raised, so callers can fall back to template-based output.
    """

    def __init__(self, config: Optional["ReportConfig"] = None):
        """
        Initialize the LLM engine without loading any model.

        Args:
            config: Report configuration (a default ``ReportConfig()`` is
                created if None or otherwise falsy)
        """
        self.config = config or ReportConfig()
        self.llm_config = self.config.llm
        self.model = None
        self._is_loaded = False
        self._model_type = None  # 'qwen' or 'phi'; selects the chat template

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._is_loaded

    def download_model(self, use_alternative: bool = False) -> str:
        """
        Download a model file from HuggingFace Hub into ``llm_config.models_dir``.

        Args:
            use_alternative: If True, download the alternative (Phi) model
                instead of the primary (Qwen) one

        Returns:
            Path to the downloaded model file, as returned by
            ``hf_hub_download``

        Raises:
            ImportError: If ``huggingface_hub`` is not installed
        """
        try:
            from huggingface_hub import hf_hub_download
        except ImportError as err:
            raise ImportError(
                "huggingface_hub is required to download models. "
                "Install with: pip install huggingface_hub"
            ) from err

        if use_alternative:
            repo_id = self.llm_config.alt_hf_repo
            filename = self.llm_config.alt_hf_filename
        else:
            repo_id = self.llm_config.hf_repo
            filename = self.llm_config.hf_filename

        logger.info("Downloading model from %s/%s...", repo_id, filename)
        # Download to models directory
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=self.llm_config.models_dir,
            local_dir_use_symlinks=False
        )
        logger.info("Model downloaded to: %s", downloaded_path)
        return downloaded_path

    def _resolve_model_path(self) -> Optional[str]:
        """
        Pick the model file to load and record the matching model type.

        Prefers the primary model path, then the alternative path; if neither
        exists on disk, attempts to download the primary model.

        Returns:
            Path of the model file, or None if no file exists and the
            download failed
        """
        primary_path = self.llm_config.model_path
        if os.path.exists(primary_path):
            self._model_type = 'qwen'
            return primary_path

        fallback_path = self.llm_config.alt_model_path
        self._model_type = 'phi'
        if os.path.exists(fallback_path):
            return fallback_path

        logger.warning("No model found. Attempting to download...")
        try:
            downloaded_path = self.download_model()
        except Exception as e:
            logger.error("Failed to download model: %s", e)
            return None
        # download_model() without arguments fetches the primary model.
        self._model_type = 'qwen'
        return downloaded_path

    def load_model(self, force_reload: bool = False) -> bool:
        """
        Load the LLM model into memory.

        Args:
            force_reload: Force reload even if already loaded

        Returns:
            True if successful, False otherwise (llama-cpp-python missing,
            no model file obtainable, or the model failed to load)
        """
        if self._is_loaded and not force_reload:
            logger.info("Model already loaded")
            return True

        try:
            from llama_cpp import Llama
        except ImportError:
            logger.warning(
                "llama-cpp-python is not installed — LLM report generation disabled. "
                "Reports will use template-based fallback."
            )
            self._is_loaded = False
            return False

        model_path = self._resolve_model_path()
        if model_path is None:
            return False

        logger.info("Loading model from: %s", model_path)
        logger.info("Model type: %s", self._model_type)

        if self.model is not None:
            # Forced reload: drop the previous model before building the new
            # one so both are not referenced at once, and so a failed reload
            # does not leave a stale model behind an "unloaded" flag.
            self.model = None
            self._is_loaded = False

        try:
            self.model = Llama(
                model_path=model_path,
                n_ctx=self.llm_config.n_ctx,
                n_threads=self.llm_config.n_threads,
                n_gpu_layers=self.llm_config.n_gpu_layers,
                verbose=False
            )
        except Exception as e:
            logger.error("Failed to load model: %s", e)
            self._is_loaded = False
            return False

        self._is_loaded = True
        logger.info("✅ Model loaded successfully")
        return True

    def _format_prompt(self, system_prompt: str, user_prompt: str) -> str:
        """
        Format prompt according to the current model type's chat template.

        Args:
            system_prompt: System instructions
            user_prompt: User's request

        Returns:
            Formatted prompt string ending with the assistant turn header
        """
        if self._model_type == 'qwen':
            # Qwen2.5 chat format
            return (
                "<|im_start|>system\n"
                f"{system_prompt}<|im_end|>\n"
                "<|im_start|>user\n"
                f"{user_prompt}<|im_end|>\n"
                "<|im_start|>assistant\n"
            )
        # Phi-3 chat format (also used when the model type is unset)
        return (
            "<|system|>\n"
            f"{system_prompt}<|end|>\n"
            "<|user|>\n"
            f"{user_prompt}<|end|>\n"
            "<|assistant|>\n"
        )

    def generate(
        self,
        system_prompt: str,
        user_prompt: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        stop_sequences: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Generate text using the loaded LLM, loading it first if necessary.

        Args:
            system_prompt: System instructions for the model
            user_prompt: The actual prompt/request
            max_tokens: Override max tokens (config default is used if None
                or 0)
            temperature: Override temperature (config default is used only
                if None, so an explicit 0.0 is honoured)
            stop_sequences: Custom stop sequences (model-specific defaults
                are used if None)

        Returns:
            Dict with 'text', 'tokens_used', 'finish_reason' and 'error'.
            On failure 'finish_reason' is 'error' and 'error' holds a message.
        """
        if not self._is_loaded and not self.load_model():
            return {
                'text': '',
                'tokens_used': 0,
                'finish_reason': 'error',
                'error': 'Model not loaded'
            }

        # Format the prompt
        formatted_prompt = self._format_prompt(system_prompt, user_prompt)

        # Set parameters
        max_tokens = max_tokens or self.llm_config.max_tokens
        # `is None` rather than `or`: 0.0 is a valid temperature and must not
        # be silently replaced by the configured default.
        if temperature is None:
            temperature = self.llm_config.temperature

        # Default stop sequences based on model type
        if stop_sequences is None:
            if self._model_type == 'qwen':
                stop_sequences = ["<|im_end|>", "<|im_start|>"]
            else:
                stop_sequences = ["<|end|>", "<|user|>"]

        logger.debug(
            "Generating with max_tokens=%s, temp=%s", max_tokens, temperature
        )

        try:
            output = self.model(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=self.llm_config.top_p,
                repeat_penalty=self.llm_config.repeat_penalty,
                stop=stop_sequences,
                echo=False
            )
            choice = output['choices'][0]
            return {
                'text': choice['text'].strip(),
                'tokens_used': output.get('usage', {}).get('total_tokens', 0),
                'finish_reason': choice.get('finish_reason', 'stop'),
                'error': None
            }
        except Exception as e:
            # Deliberate best-effort boundary: report the failure in the
            # result dict instead of raising.
            logger.error("Generation error: %s", e)
            return {
                'text': '',
                'tokens_used': 0,
                'finish_reason': 'error',
                'error': str(e)
            }

    def generate_structured(
        self,
        system_prompt: str,
        user_prompt: str,
        output_format: str = 'markdown'
    ) -> Dict[str, Any]:
        """
        Generate structured output (Markdown or JSON).

        Args:
            system_prompt: System instructions
            user_prompt: User request
            output_format: 'json' requests JSON; any other value requests
                Markdown

        Returns:
            The dict from ``generate``. For 'json' it additionally carries
            'parsed' (the decoded object, or None) and, when decoding of a
            non-empty response failed, 'parse_error'.
        """
        # Add format instructions to system prompt
        if output_format == 'json':
            format_instruction = "\nYou MUST respond with valid JSON only. No explanations outside the JSON."
        else:
            format_instruction = "\nYou MUST respond with properly formatted Markdown only."

        result = self.generate(system_prompt + format_instruction, user_prompt)
        if output_format != 'json':
            return result

        text = result['text']
        if not text:
            # Nothing to parse; still expose the key so callers can rely on it.
            result['parsed'] = None
            return result

        import json

        # The model may wrap the JSON in extra text: take the span from the
        # first '{' to the last '}'.
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            result['parsed'] = None
            result['parse_error'] = 'No JSON object found in response'
            return result

        try:
            result['parsed'] = json.loads(text[start:end])
        except json.JSONDecodeError as e:
            result['parsed'] = None
            result['parse_error'] = str(e)
        return result

    def unload_model(self):
        """Unload model from memory and reset the loaded state."""
        if self.model is not None:
            # Dropping the reference lets the underlying model be freed.
            self.model = None
        self._is_loaded = False
        self._model_type = None
        logger.info("Model unloaded")

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get information about the engine's model and configuration.

        Returns:
            Dict with 'is_loaded', 'model_type', 'model_path', 'context_size',
            'gpu_layers' and 'threads'. 'model_path' is the configured primary
            path when the model type is 'qwen', otherwise the configured
            alternative path.
        """
        if self._model_type == 'qwen':
            configured_path = self.llm_config.model_path
        else:
            configured_path = self.llm_config.alt_model_path
        return {
            'is_loaded': self._is_loaded,
            'model_type': self._model_type,
            'model_path': configured_path,
            'context_size': self.llm_config.n_ctx,
            'gpu_layers': self.llm_config.n_gpu_layers,
            'threads': self.llm_config.n_threads
        }
# Module-level cache holding the shared engine between calls
_engine_instance: Optional[LLMEngine] = None


def get_llm_engine(config: Optional[ReportConfig] = None) -> LLMEngine:
    """
    Return the shared LLMEngine, building it when required.

    A new engine is constructed (and replaces any cached one) when no engine
    has been created yet or when an explicit ``config`` is supplied.

    Args:
        config: Optional configuration; passing one always creates a fresh
            engine with it

    Returns:
        The cached LLMEngine instance
    """
    global _engine_instance
    needs_new_engine = config is not None or _engine_instance is None
    if needs_new_engine:
        _engine_instance = LLMEngine(config)
    return _engine_instance