Spaces:

Sandei
/

tech-support-helpdesk-chatbot

Sleeping

App Files Files Community

tech-support-helpdesk-chatbot / service /llm_service.py

Sandei

llm service update

1219363 2 months ago

raw

history blame contribute delete

2.26 kB

	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM

	class LLMService:
	def __init__(self):
	self.model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

	# Tokenizer
	self.tokenizer = AutoTokenizer.from_pretrained(
	self.model_name,
	use_fast=True
	)

	# Load model in FP32 on CPU
	self.model = AutoModelForCausalLM.from_pretrained(
	self.model_name,
	torch_dtype=torch.float32
	)
	self.model.eval()
	print("LLM loaded \| dtype:", next(self.model.parameters()).dtype)

	def generate(self, user_query: str, context: str = "") -> str:
	"""
	Generates a response using the user query and optional context.
	Context should be top-k retrieved documents from RAG.
	"""

	# Build a clear prompt for TinyLlama
	prompt = f"<\|user\|>You are a helpful IT assistant.\n"
	if context:
	prompt += f"Use the following context to answer the user query:\n{context}\n"
	prompt += f"User question: {user_query}\n<\|assistant\|>"

	# Tokenize
	inputs = self.tokenizer(
	prompt,
	return_tensors="pt",
	truncation=True,
	max_length=640
	)

	with torch.no_grad():
	output = self.model.generate(
	**inputs,
	max_new_tokens=200, # enough for complete step-by-step answer
	do_sample=True, # allows richer outputs
	temperature=0.7, # creative but not random
	top_p=0.9, # nucleus sampling
	eos_token_id=self.tokenizer.eos_token_id,
	use_cache=True
	)

	text = self.tokenizer.decode(output[0], skip_special_tokens=True)
	return self._clean(text)

	def _clean(self, text: str) -> str:
	"""
	Clean model output:
	- Extract text after <\|assistant\|>
	- Stop at <\|system\|> or <\|user\|>
	"""
	if "<\|assistant\|>" in text:
	text = text.split("<\|assistant\|>")[-1]

	for stop in ("<\|system\|>", "<\|user\|>"):
	if stop in text:
	text = text.split(stop)[0]

	return text.strip()