Spaces:

blacksinisterx
/

DetectifAI-Backend

Running

App Files Files Community

DetectifAI-Backend / report_generation /llm_engine.py

blacksinisterx

fix: keyframe images, video clips, evidence images, live stream webcam+URL, remove demo mode

fd50325 verified 2 months ago

raw

history blame contribute delete

11.3 kB

	"""
	Local LLM Engine for Report Generation

	Handles loading and inference with local GGUF models using llama-cpp-python.
	Supports both Qwen2.5-3B-Instruct and Phi-3-mini as fallback.
	"""

	import os
	import logging
	from typing import Optional, Dict, Any, List
	from pathlib import Path

	from .config import ReportConfig, LLMConfig

	logger = logging.getLogger(__name__)


	class LLMEngine:
	"""
	Local LLM engine using llama-cpp-python.

	Provides deterministic, instruction-following text generation
	for structured report content.
	"""

	def __init__(self, config: Optional[ReportConfig] = None):
	"""
	Initialize the LLM engine.

	Args:
	config: Report configuration (uses default if None)
	"""
	self.config = config or ReportConfig()
	self.llm_config = self.config.llm
	self.model = None
	self._is_loaded = False
	self._model_type = None # 'qwen' or 'phi'

	@property
	def is_loaded(self) -> bool:
	"""Check if model is loaded."""
	return self._is_loaded

	def download_model(self, use_alternative: bool = False) -> str:
	"""
	Download model from HuggingFace Hub.

	Args:
	use_alternative: If True, download Phi-3 instead of Qwen

	Returns:
	Path to downloaded model
	"""
	try:
	from huggingface_hub import hf_hub_download
	except ImportError:
	raise ImportError(
	"huggingface_hub is required to download models. "
	"Install with: pip install huggingface_hub"
	)

	if use_alternative:
	repo_id = self.llm_config.alt_hf_repo
	filename = self.llm_config.alt_hf_filename
	local_path = self.llm_config.alt_model_path
	else:
	repo_id = self.llm_config.hf_repo
	filename = self.llm_config.hf_filename
	local_path = self.llm_config.model_path

	logger.info(f"Downloading model from {repo_id}/{filename}...")

	# Download to models directory
	downloaded_path = hf_hub_download(
	repo_id=repo_id,
	filename=filename,
	local_dir=self.llm_config.models_dir,
	local_dir_use_symlinks=False
	)

	logger.info(f"Model downloaded to: {downloaded_path}")
	return downloaded_path

	def load_model(self, force_reload: bool = False) -> bool:
	"""
	Load the LLM model into memory.

	Args:
	force_reload: Force reload even if already loaded

	Returns:
	True if successful, False otherwise
	"""
	if self._is_loaded and not force_reload:
	logger.info("Model already loaded")
	return True

	try:
	from llama_cpp import Llama
	except ImportError:
	logger.warning(
	"llama-cpp-python is not installed — LLM report generation disabled. "
	"Reports will use template-based fallback."
	)
	self._is_loaded = False
	return False

	# Try primary model first, then alternative
	model_path = self.llm_config.model_path
	if not os.path.exists(model_path):
	model_path = self.llm_config.alt_model_path
	self._model_type = 'phi'
	else:
	self._model_type = 'qwen'

	if not os.path.exists(model_path):
	logger.warning("No model found. Attempting to download...")
	try:
	model_path = self.download_model()
	self._model_type = 'qwen'
	except Exception as e:
	logger.error(f"Failed to download model: {e}")
	return False

	logger.info(f"Loading model from: {model_path}")
	logger.info(f"Model type: {self._model_type}")

	try:
	self.model = Llama(
	model_path=model_path,
	n_ctx=self.llm_config.n_ctx,
	n_threads=self.llm_config.n_threads,
	n_gpu_layers=self.llm_config.n_gpu_layers,
	verbose=False
	)
	self._is_loaded = True
	logger.info("✅ Model loaded successfully")
	return True

	except Exception as e:
	logger.error(f"Failed to load model: {e}")
	self._is_loaded = False
	return False

	def _format_prompt(self, system_prompt: str, user_prompt: str) -> str:
	"""
	Format prompt according to model's chat template.

	Args:
	system_prompt: System instructions
	user_prompt: User's request

	Returns:
	Formatted prompt string
	"""
	if self._model_type == 'qwen':
	# Qwen2.5 chat format
	return f"""<\|im_start\|>system
	{system_prompt}<\|im_end\|>
	<\|im_start\|>user
	{user_prompt}<\|im_end\|>
	<\|im_start\|>assistant
	"""
	else:
	# Phi-3 chat format
	return f"""<\|system\|>
	{system_prompt}<\|end\|>
	<\|user\|>
	{user_prompt}<\|end\|>
	<\|assistant\|>
	"""

	def generate(
	self,
	system_prompt: str,
	user_prompt: str,
	max_tokens: Optional[int] = None,
	temperature: Optional[float] = None,
	stop_sequences: Optional[List[str]] = None
	) -> Dict[str, Any]:
	"""
	Generate text using the loaded LLM.

	Args:
	system_prompt: System instructions for the model
	user_prompt: The actual prompt/request
	max_tokens: Override max tokens (uses config default if None)
	temperature: Override temperature (uses config default if None)
	stop_sequences: Custom stop sequences

	Returns:
	Dict with 'text', 'tokens_used', 'finish_reason'
	"""
	if not self._is_loaded:
	if not self.load_model():
	return {
	'text': '',
	'tokens_used': 0,
	'finish_reason': 'error',
	'error': 'Model not loaded'
	}

	# Format the prompt
	formatted_prompt = self._format_prompt(system_prompt, user_prompt)

	# Set parameters
	max_tokens = max_tokens or self.llm_config.max_tokens
	temperature = temperature or self.llm_config.temperature

	# Default stop sequences based on model type
	if stop_sequences is None:
	if self._model_type == 'qwen':
	stop_sequences = ["<\|im_end\|>", "<\|im_start\|>"]
	else:
	stop_sequences = ["<\|end\|>", "<\|user\|>"]

	logger.debug(f"Generating with max_tokens={max_tokens}, temp={temperature}")

	try:
	output = self.model(
	formatted_prompt,
	max_tokens=max_tokens,
	temperature=temperature,
	top_p=self.llm_config.top_p,
	repeat_penalty=self.llm_config.repeat_penalty,
	stop=stop_sequences,
	echo=False
	)

	generated_text = output['choices'][0]['text'].strip()
	finish_reason = output['choices'][0].get('finish_reason', 'stop')
	tokens_used = output.get('usage', {}).get('total_tokens', 0)

	return {
	'text': generated_text,
	'tokens_used': tokens_used,
	'finish_reason': finish_reason,
	'error': None
	}

	except Exception as e:
	logger.error(f"Generation error: {e}")
	return {
	'text': '',
	'tokens_used': 0,
	'finish_reason': 'error',
	'error': str(e)
	}

	def generate_structured(
	self,
	system_prompt: str,
	user_prompt: str,
	output_format: str = 'markdown'
	) -> Dict[str, Any]:
	"""
	Generate structured output (Markdown or JSON).

	Args:
	system_prompt: System instructions
	user_prompt: User request
	output_format: 'markdown' or 'json'

	Returns:
	Dict with generated content
	"""
	# Add format instructions to system prompt
	if output_format == 'json':
	format_instruction = "\nYou MUST respond with valid JSON only. No explanations outside the JSON."
	else:
	format_instruction = "\nYou MUST respond with properly formatted Markdown only."

	enhanced_system = system_prompt + format_instruction

	result = self.generate(enhanced_system, user_prompt)

	# Parse JSON if requested
	if output_format == 'json' and result['text']:
	import json
	try:
	# Try to extract JSON from the response
	text = result['text']
	# Find JSON boundaries
	start = text.find('{')
	end = text.rfind('}') + 1
	if start != -1 and end > start:
	json_str = text[start:end]
	result['parsed'] = json.loads(json_str)
	else:
	result['parsed'] = None
	result['parse_error'] = 'No JSON object found in response'
	except json.JSONDecodeError as e:
	result['parsed'] = None
	result['parse_error'] = str(e)

	return result

	def unload_model(self):
	"""Unload model from memory."""
	if self.model:
	del self.model
	self.model = None
	self._is_loaded = False
	self._model_type = None
	logger.info("Model unloaded")

	def get_model_info(self) -> Dict[str, Any]:
	"""Get information about the loaded model."""
	return {
	'is_loaded': self._is_loaded,
	'model_type': self._model_type,
	'model_path': self.llm_config.model_path if self._model_type == 'qwen' else self.llm_config.alt_model_path,
	'context_size': self.llm_config.n_ctx,
	'gpu_layers': self.llm_config.n_gpu_layers,
	'threads': self.llm_config.n_threads
	}


	# Singleton instance for reuse
	_engine_instance: Optional[LLMEngine] = None


	def get_llm_engine(config: Optional[ReportConfig] = None) -> LLMEngine:
	"""
	Get or create the LLM engine singleton.

	Args:
	config: Optional configuration override

	Returns:
	LLMEngine instance
	"""
	global _engine_instance

	if _engine_instance is None or config is not None:
	_engine_instance = LLMEngine(config)

	return _engine_instance