File size: 11,304 Bytes
fd50325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
"""

Local LLM Engine for Report Generation

Handles loading and inference with local GGUF models using llama-cpp-python.
Supports Qwen2.5-3B-Instruct as the primary model, with Phi-3-mini as a fallback.

"""

import os
import logging
from typing import Optional, Dict, Any, List
from pathlib import Path

from .config import ReportConfig, LLMConfig

logger = logging.getLogger(__name__)


class LLMEngine:
    """

    Local LLM engine using llama-cpp-python.

    

    Provides deterministic, instruction-following text generation

    for structured report content.

    """
    
    def __init__(self, config: Optional[ReportConfig] = None):
        """

        Initialize the LLM engine.

        

        Args:

            config: Report configuration (uses default if None)

        """
        self.config = config or ReportConfig()
        self.llm_config = self.config.llm
        self.model = None
        self._is_loaded = False
        self._model_type = None  # 'qwen' or 'phi'
        
    @property
    def is_loaded(self) -> bool:
        """Check if model is loaded."""
        return self._is_loaded
    
    def download_model(self, use_alternative: bool = False) -> str:
        """

        Download model from HuggingFace Hub.

        

        Args:

            use_alternative: If True, download Phi-3 instead of Qwen

            

        Returns:

            Path to downloaded model

        """
        try:
            from huggingface_hub import hf_hub_download
        except ImportError:
            raise ImportError(
                "huggingface_hub is required to download models. "
                "Install with: pip install huggingface_hub"
            )
        
        if use_alternative:
            repo_id = self.llm_config.alt_hf_repo
            filename = self.llm_config.alt_hf_filename
            local_path = self.llm_config.alt_model_path
        else:
            repo_id = self.llm_config.hf_repo
            filename = self.llm_config.hf_filename
            local_path = self.llm_config.model_path
        
        logger.info(f"Downloading model from {repo_id}/{filename}...")
        
        # Download to models directory
        downloaded_path = hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            local_dir=self.llm_config.models_dir,
            local_dir_use_symlinks=False
        )
        
        logger.info(f"Model downloaded to: {downloaded_path}")
        return downloaded_path
    
    def load_model(self, force_reload: bool = False) -> bool:
        """

        Load the LLM model into memory.

        

        Args:

            force_reload: Force reload even if already loaded

            

        Returns:

            True if successful, False otherwise

        """
        if self._is_loaded and not force_reload:
            logger.info("Model already loaded")
            return True
        
        try:
            from llama_cpp import Llama
        except ImportError:
            logger.warning(
                "llama-cpp-python is not installed — LLM report generation disabled. "
                "Reports will use template-based fallback."
            )
            self._is_loaded = False
            return False
        
        # Try primary model first, then alternative
        model_path = self.llm_config.model_path
        if not os.path.exists(model_path):
            model_path = self.llm_config.alt_model_path
            self._model_type = 'phi'
        else:
            self._model_type = 'qwen'
        
        if not os.path.exists(model_path):
            logger.warning("No model found. Attempting to download...")
            try:
                model_path = self.download_model()
                self._model_type = 'qwen'
            except Exception as e:
                logger.error(f"Failed to download model: {e}")
                return False
        
        logger.info(f"Loading model from: {model_path}")
        logger.info(f"Model type: {self._model_type}")
        
        try:
            self.model = Llama(
                model_path=model_path,
                n_ctx=self.llm_config.n_ctx,
                n_threads=self.llm_config.n_threads,
                n_gpu_layers=self.llm_config.n_gpu_layers,
                verbose=False
            )
            self._is_loaded = True
            logger.info("✅ Model loaded successfully")
            return True
            
        except Exception as e:
            logger.error(f"Failed to load model: {e}")
            self._is_loaded = False
            return False
    
    def _format_prompt(self, system_prompt: str, user_prompt: str) -> str:
        """

        Format prompt according to model's chat template.

        

        Args:

            system_prompt: System instructions

            user_prompt: User's request

            

        Returns:

            Formatted prompt string

        """
        if self._model_type == 'qwen':
            # Qwen2.5 chat format
            return f"""<|im_start|>system

{system_prompt}<|im_end|>

<|im_start|>user

{user_prompt}<|im_end|>

<|im_start|>assistant

"""
        else:
            # Phi-3 chat format
            return f"""<|system|>

{system_prompt}<|end|>

<|user|>

{user_prompt}<|end|>

<|assistant|>

"""
    
    def generate(

        self,

        system_prompt: str,

        user_prompt: str,

        max_tokens: Optional[int] = None,

        temperature: Optional[float] = None,

        stop_sequences: Optional[List[str]] = None

    ) -> Dict[str, Any]:
        """

        Generate text using the loaded LLM.

        

        Args:

            system_prompt: System instructions for the model

            user_prompt: The actual prompt/request

            max_tokens: Override max tokens (uses config default if None)

            temperature: Override temperature (uses config default if None)

            stop_sequences: Custom stop sequences

            

        Returns:

            Dict with 'text', 'tokens_used', 'finish_reason'

        """
        if not self._is_loaded:
            if not self.load_model():
                return {
                    'text': '',
                    'tokens_used': 0,
                    'finish_reason': 'error',
                    'error': 'Model not loaded'
                }
        
        # Format the prompt
        formatted_prompt = self._format_prompt(system_prompt, user_prompt)
        
        # Set parameters
        max_tokens = max_tokens or self.llm_config.max_tokens
        temperature = temperature or self.llm_config.temperature
        
        # Default stop sequences based on model type
        if stop_sequences is None:
            if self._model_type == 'qwen':
                stop_sequences = ["<|im_end|>", "<|im_start|>"]
            else:
                stop_sequences = ["<|end|>", "<|user|>"]
        
        logger.debug(f"Generating with max_tokens={max_tokens}, temp={temperature}")
        
        try:
            output = self.model(
                formatted_prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=self.llm_config.top_p,
                repeat_penalty=self.llm_config.repeat_penalty,
                stop=stop_sequences,
                echo=False
            )
            
            generated_text = output['choices'][0]['text'].strip()
            finish_reason = output['choices'][0].get('finish_reason', 'stop')
            tokens_used = output.get('usage', {}).get('total_tokens', 0)
            
            return {
                'text': generated_text,
                'tokens_used': tokens_used,
                'finish_reason': finish_reason,
                'error': None
            }
            
        except Exception as e:
            logger.error(f"Generation error: {e}")
            return {
                'text': '',
                'tokens_used': 0,
                'finish_reason': 'error',
                'error': str(e)
            }
    
    def generate_structured(

        self,

        system_prompt: str,

        user_prompt: str,

        output_format: str = 'markdown'

    ) -> Dict[str, Any]:
        """

        Generate structured output (Markdown or JSON).

        

        Args:

            system_prompt: System instructions

            user_prompt: User request

            output_format: 'markdown' or 'json'

            

        Returns:

            Dict with generated content

        """
        # Add format instructions to system prompt
        if output_format == 'json':
            format_instruction = "\nYou MUST respond with valid JSON only. No explanations outside the JSON."
        else:
            format_instruction = "\nYou MUST respond with properly formatted Markdown only."
        
        enhanced_system = system_prompt + format_instruction
        
        result = self.generate(enhanced_system, user_prompt)
        
        # Parse JSON if requested
        if output_format == 'json' and result['text']:
            import json
            try:
                # Try to extract JSON from the response
                text = result['text']
                # Find JSON boundaries
                start = text.find('{')
                end = text.rfind('}') + 1
                if start != -1 and end > start:
                    json_str = text[start:end]
                    result['parsed'] = json.loads(json_str)
                else:
                    result['parsed'] = None
                    result['parse_error'] = 'No JSON object found in response'
            except json.JSONDecodeError as e:
                result['parsed'] = None
                result['parse_error'] = str(e)
        
        return result
    
    def unload_model(self):
        """Unload model from memory."""
        if self.model:
            del self.model
            self.model = None
        self._is_loaded = False
        self._model_type = None
        logger.info("Model unloaded")
    
    def get_model_info(self) -> Dict[str, Any]:
        """Get information about the loaded model."""
        return {
            'is_loaded': self._is_loaded,
            'model_type': self._model_type,
            'model_path': self.llm_config.model_path if self._model_type == 'qwen' else self.llm_config.alt_model_path,
            'context_size': self.llm_config.n_ctx,
            'gpu_layers': self.llm_config.n_gpu_layers,
            'threads': self.llm_config.n_threads
        }


# Module-wide engine shared by every caller of get_llm_engine()
_engine_instance: Optional[LLMEngine] = None


def get_llm_engine(config: Optional[ReportConfig] = None) -> LLMEngine:
    """
    Return the shared LLM engine, creating it when necessary.

    A new engine is built (and replaces the shared one) when none exists
    yet or whenever a configuration is passed explicitly.

    Args:
        config: Optional configuration override

    Returns:
        LLMEngine instance
    """
    global _engine_instance

    can_reuse = config is None and _engine_instance is not None
    if not can_reuse:
        _engine_instance = LLMEngine(config)

    return _engine_instance