# Sandei's picture
# llm service update
# 1219363
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
class LLMService:
    """Local causal-LM wrapper that answers a user query, optionally with RAG context.

    The tokenizer and model are loaded once in ``__init__``; ``generate`` builds
    a role-tagged prompt, samples a completion and returns only the assistant's
    reply text.
    """

    DEFAULT_MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    MAX_PROMPT_TOKENS = 640  # upper bound on prompt length passed to the model
    MAX_NEW_TOKENS = 200  # enough for a complete step-by-step answer

    _ASSISTANT_TAG = "<|assistant|>"
    # Any of these appearing in a completion marks the start of another turn.
    _ROLE_TAGS = ("<|system|>", "<|user|>", "<|assistant|>")

    def __init__(self, model_name: str = DEFAULT_MODEL_NAME):
        """Load the tokenizer and model for ``model_name``.

        Args:
            model_name: Hugging Face model identifier. Defaults to the
                TinyLlama chat checkpoint this service was written for.
        """
        self.model_name = model_name

        # Tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_fast=True,
        )

        # Load model in FP32 on CPU (no device placement is requested here).
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float32,
        )
        self.model.eval()
        print("LLM loaded | dtype:", next(self.model.parameters()).dtype)

    def generate(self, user_query: str, context: str = "") -> str:
        """Generate a response for the user query and optional context.

        Args:
            user_query: The question to answer.
            context: Optional supporting text, e.g. the top-k documents
                retrieved by RAG. If the prompt would exceed
                ``MAX_PROMPT_TOKENS``, the *context* is shortened so that the
                question and the assistant tag at the end of the prompt are
                not cut off.

        Returns:
            The assistant's reply with surrounding whitespace removed.
        """
        context = self._fit_context(user_query, context)
        prompt = self._build_prompt(user_query, context)

        # Truncation stays on as a last resort: it only triggers when the
        # question alone (with no context left) is over the budget.
        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.MAX_PROMPT_TOKENS,
        )

        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=self.MAX_NEW_TOKENS,
                do_sample=True,  # allows richer outputs
                temperature=0.7,  # creative but not random
                top_p=0.9,  # nucleus sampling
                eos_token_id=self.tokenizer.eos_token_id,
                use_cache=True,
            )

        # The returned sequence starts with the prompt tokens; decode only what
        # was generated so the prompt never has to be stripped by string search.
        prompt_len = inputs["input_ids"].shape[-1]
        completion = self.tokenizer.decode(
            output[0][prompt_len:],
            skip_special_tokens=True,
        )
        return self._extract_reply(completion)

    def _build_prompt(self, user_query: str, context: str = "") -> str:
        """Return the role-tagged prompt; the context section is omitted when empty."""
        # NOTE(review): this hand-written layout may differ from the model's own
        # chat template (tokenizer.apply_chat_template) — confirm before changing.
        parts = ["<|user|>You are a helpful IT assistant.\n"]
        if context:
            parts.append(
                f"Use the following context to answer the user query:\n{context}\n"
            )
        parts.append(f"User question: {user_query}\n<|assistant|>")
        return "".join(parts)

    def _count_tokens(self, text: str) -> int:
        """Return how many tokens the tokenizer produces for ``text``."""
        return len(self.tokenizer(text)["input_ids"])

    def _fit_context(self, user_query: str, context: str) -> str:
        """Shorten ``context`` until the full prompt fits ``MAX_PROMPT_TOKENS``.

        The context is returned unchanged when the prompt already fits. Tokens
        are dropped from the end of the context; an empty string is returned
        if no part of the context can be kept.
        """
        if not context:
            return context

        context_ids = None
        while True:
            prompt = self._build_prompt(user_query, context)
            excess = self._count_tokens(prompt) - self.MAX_PROMPT_TOKENS
            if excess <= 0:
                return context

            if context_ids is None:
                # Tokenize lazily: most prompts fit and never reach this point.
                context_ids = self.tokenizer(
                    context, add_special_tokens=False
                )["input_ids"]

            keep = len(context_ids) - excess
            if keep <= 0:
                return ""
            # Decoding and re-tokenizing can shift token boundaries slightly,
            # so the loop re-measures the prompt instead of trusting `keep`.
            context_ids = context_ids[:keep]
            context = self.tokenizer.decode(context_ids, skip_special_tokens=True)

    def _extract_reply(self, completion: str) -> str:
        """Cut ``completion`` at the first role tag and strip whitespace."""
        cut = len(completion)
        for tag in self._ROLE_TAGS:
            idx = completion.find(tag)
            if idx != -1:
                cut = min(cut, idx)
        return completion[:cut].strip()

    def _clean(self, text: str) -> str:
        """Clean model output that may still contain the prompt.

        - Extract the text after the first ``<|assistant|>`` tag, if present.
        - Stop at the next role tag (``<|system|>``, ``<|user|>`` or
          ``<|assistant|>``), so a follow-up turn invented by the model is
          discarded rather than returned.
        """
        _, tag, reply = text.partition(self._ASSISTANT_TAG)
        return self._extract_reply(reply if tag else text)