# Hugging Face Space: blacksinisterx/DetectifAI-Backend (status: Running)
# File size: 11,304 Bytes
# Commit: fd50325

"""

Local LLM Engine for Report Generation



Handles loading and inference with local GGUF models using llama-cpp-python.

Supports both Qwen2.5-3B-Instruct and Phi-3-mini as fallback.

"""

import os
import logging
from typing import Optional, Dict, Any, List
from pathlib import Path

from .config import ReportConfig, LLMConfig

logger = logging.getLogger(__name__)


class LLMEngine:
    """
    Local LLM engine using llama-cpp-python.

    Wraps a single GGUF model (the configured primary "qwen" model, or the
    alternative "phi" model when the primary file is missing) and exposes
    chat-formatted text generation for structured report content.

    The model is loaded lazily: ``generate`` calls ``load_model`` on first
    use. Load and generation failures are reported through return values /
    result dicts rather than raised, so callers can degrade gracefully.
    """

    def __init__(self, config: "Optional[ReportConfig]" = None):
        """
        Initialize the LLM engine without loading any model.

        Args:
            config: Report configuration (a default ``ReportConfig`` is
                created if None).
        """
        self.config = config if config is not None else ReportConfig()
        self.llm_config = self.config.llm
        self.model = None
        self._is_loaded = False
        self._model_type = None  # 'qwen' or 'phi'
        self._model_path = None  # path of the model file actually loaded

    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._is_loaded

    def download_model(self, use_alternative: bool = False) -> str:
        """
        Download a model file from the HuggingFace Hub into ``models_dir``.

        Args:
            use_alternative: If True, download the alternative (Phi-3) model
                instead of the primary (Qwen) one.

        Returns:
            Path to the downloaded model file, as returned by
            ``hf_hub_download``.

        Raises:
            ImportError: If ``huggingface_hub`` is not installed.
        """
        try:
            from huggingface_hub import hf_hub_download
        except ImportError as err:
            raise ImportError(
                "huggingface_hub is required to download models. "
                "Install with: pip install huggingface_hub"
            ) from err

        if use_alternative:
            repo_id = self.llm_config.alt_hf_repo
            filename = self.llm_config.alt_hf_filename
        else:
            repo_id = self.llm_config.hf_repo
            filename = self.llm_config.hf_filename

        logger.info("Downloading model from %s/%s...", repo_id, filename)

        # Download to models directory
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=self.llm_config.models_dir,
            local_dir_use_symlinks=False
        )

        logger.info("Model downloaded to: %s", downloaded_path)
        return downloaded_path

    def load_model(self, force_reload: bool = False) -> bool:
        """
        Load the LLM model into memory.

        Resolution order: the configured primary model file, then the
        alternative model file, then a download of the primary model.
        Engine state (model, type, path) is only committed once the new
        model has been constructed successfully.

        Args:
            force_reload: Force reload even if already loaded. The previously
                loaded model is dropped before the new one is constructed.

        Returns:
            True if successful, False otherwise (missing llama-cpp-python,
            failed download, or failed model construction).
        """
        if self._is_loaded and not force_reload:
            logger.info("Model already loaded")
            return True

        try:
            from llama_cpp import Llama
        except ImportError:
            logger.warning(
                "llama-cpp-python is not installed — LLM report generation disabled. "
                "Reports will use template-based fallback."
            )
            self._is_loaded = False
            return False

        # Try primary model first, then alternative, then download.
        primary_path = self.llm_config.model_path
        fallback_path = self.llm_config.alt_model_path
        if os.path.exists(primary_path):
            model_path, model_type = primary_path, 'qwen'
        elif os.path.exists(fallback_path):
            model_path, model_type = fallback_path, 'phi'
        else:
            logger.warning("No model found. Attempting to download...")
            try:
                model_path = self.download_model()
            except Exception as e:
                logger.error("Failed to download model: %s", e)
                return False
            model_type = 'qwen'

        logger.info("Loading model from: %s", model_path)
        logger.info("Model type: %s", model_type)

        # Drop any previous model before constructing the new one so a forced
        # reload does not keep two models referenced at the same time.
        self.model = None
        self._is_loaded = False
        self._model_type = None
        self._model_path = None

        try:
            self.model = Llama(
                model_path=model_path,
                n_ctx=self.llm_config.n_ctx,
                n_threads=self.llm_config.n_threads,
                n_gpu_layers=self.llm_config.n_gpu_layers,
                verbose=False
            )
        except Exception as e:
            logger.error("Failed to load model: %s", e)
            return False

        self._model_type = model_type
        self._model_path = model_path
        self._is_loaded = True
        logger.info("✅ Model loaded successfully")
        return True

    def _format_prompt(self, system_prompt: str, user_prompt: str) -> str:
        """
        Format prompt according to the loaded model's chat template.

        Args:
            system_prompt: System instructions
            user_prompt: User's request

        Returns:
            Prompt string ending with the opening of the assistant turn.
            The Qwen (ChatML) layout is used when the model type is 'qwen';
            the Phi-3 layout is used otherwise.
        """
        # Turns are separated by exactly one newline; extra blank lines are
        # not part of either template.
        if self._model_type == 'qwen':
            # Qwen2.5 chat format (ChatML)
            return (
                f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
                f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
                "<|im_start|>assistant\n"
            )

        # Phi-3 chat format
        return (
            f"<|system|>\n{system_prompt}<|end|>\n"
            f"<|user|>\n{user_prompt}<|end|>\n"
            "<|assistant|>\n"
        )

    def generate(
        self,
        system_prompt: str,
        user_prompt: str,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None,
        stop_sequences: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Generate text using the loaded LLM, loading it first if necessary.

        Args:
            system_prompt: System instructions for the model
            user_prompt: The actual prompt/request
            max_tokens: Override max tokens (uses config default if None)
            temperature: Override temperature (uses config default if None;
                an explicit 0.0 is honored)
            stop_sequences: Custom stop sequences (model-specific defaults
                are used if None)

        Returns:
            Dict with 'text', 'tokens_used', 'finish_reason' and 'error'.
            On failure 'text' is empty, 'finish_reason' is 'error' and
            'error' holds the message; no exception is raised for
            generation errors.
        """
        if not self._is_loaded and not self.load_model():
            return {
                'text': '',
                'tokens_used': 0,
                'finish_reason': 'error',
                'error': 'Model not loaded'
            }

        formatted_prompt = self._format_prompt(system_prompt, user_prompt)

        # Only fall back to config defaults when the caller passed nothing;
        # `value or default` would discard a legitimate temperature of 0.0.
        if max_tokens is None:
            max_tokens = self.llm_config.max_tokens
        if temperature is None:
            temperature = self.llm_config.temperature

        # Default stop sequences based on model type
        if stop_sequences is None:
            if self._model_type == 'qwen':
                stop_sequences = ["<|im_end|>", "<|im_start|>"]
            else:
                stop_sequences = ["<|end|>", "<|user|>"]

        logger.debug(
            "Generating with max_tokens=%s, temp=%s", max_tokens, temperature
        )

        try:
            output = self.model(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=self.llm_config.top_p,
                repeat_penalty=self.llm_config.repeat_penalty,
                stop=stop_sequences,
                echo=False
            )

            choice = output['choices'][0]
            return {
                'text': choice['text'].strip(),
                'tokens_used': output.get('usage', {}).get('total_tokens', 0),
                'finish_reason': choice.get('finish_reason', 'stop'),
                'error': None
            }

        except Exception as e:
            logger.error("Generation error: %s", e)
            return {
                'text': '',
                'tokens_used': 0,
                'finish_reason': 'error',
                'error': str(e)
            }

    @staticmethod
    def _extract_json(text: str) -> tuple:
        """
        Extract and parse the outermost ``{...}`` span from model output.

        Args:
            text: Raw generated text, possibly with prose around the JSON.

        Returns:
            ``(parsed, error)``: the decoded object and None on success, or
            None and an error message when no object is found or decoding
            fails.
        """
        import json

        # Take everything from the first '{' to the last '}' so surrounding
        # explanations or code fences are ignored.
        start = text.find('{')
        end = text.rfind('}') + 1
        if start == -1 or end <= start:
            return None, 'No JSON object found in response'

        try:
            return json.loads(text[start:end]), None
        except json.JSONDecodeError as e:
            return None, str(e)

    def generate_structured(
        self,
        system_prompt: str,
        user_prompt: str,
        output_format: str = 'markdown'
    ) -> Dict[str, Any]:
        """
        Generate structured output (Markdown or JSON).

        Args:
            system_prompt: System instructions
            user_prompt: User request
            output_format: 'markdown' or 'json' (any other value is treated
                as Markdown)

        Returns:
            The dict returned by ``generate``. For 'json' requests it always
            carries a 'parsed' key (the decoded object, or None) and, when
            parsing failed or no text was produced, a 'parse_error' message.
        """
        # Add format instructions to system prompt
        if output_format == 'json':
            format_instruction = "\nYou MUST respond with valid JSON only. No explanations outside the JSON."
        else:
            format_instruction = "\nYou MUST respond with properly formatted Markdown only."

        result = self.generate(system_prompt + format_instruction, user_prompt)

        if output_format == 'json':
            # Always set 'parsed' so callers can read it without a KeyError,
            # even when generation failed or returned empty text.
            parsed, parse_error = self._extract_json(result['text'])
            result['parsed'] = parsed
            if parse_error is not None:
                result['parse_error'] = parse_error

        return result

    def unload_model(self):
        """Drop the model reference and reset the engine to the unloaded state."""
        self.model = None
        self._is_loaded = False
        self._model_type = None
        self._model_path = None
        logger.info("Model unloaded")

    def get_model_info(self) -> Dict[str, Any]:
        """
        Get information about the loaded model.

        Returns:
            Dict with 'is_loaded', 'model_type', 'model_path', 'context_size',
            'gpu_layers' and 'threads'. 'model_path' is the file actually
            loaded (including a downloaded file); when nothing is loaded it
            is the configured primary model path.
        """
        return {
            'is_loaded': self._is_loaded,
            'model_type': self._model_type,
            'model_path': self._model_path or self.llm_config.model_path,
            'context_size': self.llm_config.n_ctx,
            'gpu_layers': self.llm_config.n_gpu_layers,
            'threads': self.llm_config.n_threads
        }


# Module-wide engine shared by all callers of get_llm_engine()
_engine_instance: Optional[LLMEngine] = None


def get_llm_engine(config: Optional[ReportConfig] = None) -> LLMEngine:
    """
    Return the shared LLM engine, building it when required.

    A new engine is constructed (and replaces the shared one) when none
    exists yet or whenever a configuration is passed explicitly.

    Args:
        config: Optional configuration override; passing one always
            replaces the current shared engine.

    Returns:
        The shared LLMEngine instance.
    """
    global _engine_instance

    must_rebuild = config is not None or _engine_instance is None
    if must_rebuild:
        _engine_instance = LLMEngine(config)
    return _engine_instance