| import torch |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
class LLMService:
    """Thin wrapper around a Hugging Face causal LM used as an IT assistant.

    The tokenizer and model are loaded once in ``__init__``; ``generate``
    builds a prompt from the user query (plus optional retrieved context),
    samples a completion and returns only the assistant's reply text.
    """

    DEFAULT_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

    # Upper bound on prompt length (in tokens) passed to the model.
    MAX_PROMPT_TOKENS = 640
    # Upper bound on the number of tokens sampled per call.
    MAX_NEW_TOKENS = 200

    _ASSISTANT_TAG = "<|assistant|>"
    # Role markers used in the prompt; any of them appearing inside the
    # generated text marks the start of a turn that is not part of the reply.
    _ROLE_TAGS = ("<|system|>", "<|user|>", "<|assistant|>")
    # Safety slack when trimming context: token counts are not perfectly
    # additive across string boundaries, so trim slightly more than the
    # measured overflow.
    _TRIM_MARGIN = 4

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the tokenizer and model and switch the model to eval mode.

        Args:
            model_name: Hugging Face model identifier or local path.
                Defaults to the TinyLlama chat model.
        """
        self.model_name = model_name

        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_fast=True,
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float32,
        )
        self.model.eval()
        print("LLM loaded | dtype:", next(self.model.parameters()).dtype)

    def generate(self, user_query: str, context: str = "") -> str:
        """Generate a reply to ``user_query``, optionally grounded in ``context``.

        Args:
            user_query: The user's question.
            context: Optional supporting text (e.g. the top-k documents
                retrieved by a RAG pipeline). If the full prompt would exceed
                ``MAX_PROMPT_TOKENS`` the context is shortened so that the
                question and the assistant tag are kept.

        Returns:
            The assistant's reply with prompt text and role tags removed.
        """
        context = self._fit_context(user_query, context)
        prompt = self._build_prompt(user_query, context)

        # Truncation stays on as a last resort (e.g. an extremely long
        # question); the context has already been trimmed to fit.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.MAX_PROMPT_TOKENS,
        ).to(self.model.device)
        prompt_len = inputs["input_ids"].shape[1]

        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=self.MAX_NEW_TOKENS,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                eos_token_id=self.tokenizer.eos_token_id,
                use_cache=True,
            )

        # The returned sequence starts with the prompt tokens. Decode only
        # what was generated so role tags inside the query/context (or a
        # truncated prompt) can never be mistaken for the reply.
        new_tokens = output[0][prompt_len:]
        text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)
        return self._clean(text)

    @staticmethod
    def _build_prompt(user_query: str, context: str = "") -> str:
        """Assemble the prompt string from the question and optional context."""
        parts = ["<|user|>You are a helpful IT assistant.\n"]
        if context:
            parts.append(
                f"Use the following context to answer the user query:\n{context}\n"
            )
        parts.append(f"User question: {user_query}\n<|assistant|>")
        return "".join(parts)

    def _fit_context(self, user_query: str, context: str) -> str:
        """Shorten ``context`` so the whole prompt fits ``MAX_PROMPT_TOKENS``.

        Only the tail of the context is dropped; the instruction line, the
        question and the trailing assistant tag are never touched. Returns
        ``context`` unchanged when it is empty or the prompt already fits.
        """
        if not context:
            return context

        prompt_ids = self.tokenizer(
            self._build_prompt(user_query, context)
        )["input_ids"]
        overflow = len(prompt_ids) - self.MAX_PROMPT_TOKENS
        if overflow <= 0:
            return context

        context_ids = self.tokenizer(
            context, add_special_tokens=False
        )["input_ids"]
        keep = len(context_ids) - overflow - self._TRIM_MARGIN
        if keep <= 0:
            # Nothing useful would remain; fall back to a context-free prompt.
            return ""
        return self.tokenizer.decode(context_ids[:keep], skip_special_tokens=True)

    def _clean(self, text: str) -> str:
        """Extract the assistant's reply from decoded model output.

        Accepts either just the generated continuation or a full transcript
        that still contains the prompt:

        - If the text starts with ``<|system|>`` or ``<|user|>`` it is treated
          as a transcript and everything up to and including the first
          ``<|assistant|>`` tag is dropped.
        - A leading ``<|assistant|>`` tag is removed.
        - The reply ends at the next role tag of any kind, so follow-up turns
          invented by the model are discarded.
        """
        reply = text.strip()

        if reply.startswith(("<|system|>", "<|user|>")):
            _, tag, after = reply.partition(self._ASSISTANT_TAG)
            if tag:
                reply = after.lstrip()

        if reply.startswith(self._ASSISTANT_TAG):
            reply = reply[len(self._ASSISTANT_TAG):]

        # Cut at the earliest role tag that appears inside the reply.
        tag_positions = [
            pos for pos in map(reply.find, self._ROLE_TAGS) if pos != -1
        ]
        if tag_positions:
            reply = reply[:min(tag_positions)]

        return reply.strip()
|
|