Spaces:

Parsa2025AI
/

MyChatbot

Paused

App Files Files Community

MyChatbot / app /llm.py

Parsa2025AI

loading llm model

a8f02a6 verified about 1 month ago

raw

history blame contribute delete

2.91 kB

	"""
	LLM client — loads the model directly inside the HF Space.
	No external API calls needed. Works on free-tier CPU Spaces.
	Default model: Qwen/Qwen2.5-1.5B-Instruct (fast on CPU, no gating)
	"""

	import os
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

	# Hugging Face model repo to load; override it with the MODEL_ID environment
	# variable. The default is the model named in the module docstring: a small,
	# ungated instruct model intended to run on a free-tier CPU Space.
	MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")

	# System message placed first in every conversation built by
	# LLMClient.generate. It sets the assistant's persona and tells the model to
	# answer only from the retrieved context that generate() puts in the user turn.
	SYSTEM_PROMPT = """You are Parsa's personal AI assistant. You help recruiters and hiring managers learn about Parsa Rouhi — an AI/ML engineer seeking roles in the UK.

	Use the provided context (retrieved from Parsa's knowledge base) to answer questions accurately. Be professional, warm, and concise. If a question isn't covered by the context, say so honestly — don't invent information.

	Always speak about Parsa in third person. Keep answers focused and relevant to a recruiting context."""


	class LLMClient:
	def __init__(self):
	hf_token = os.getenv("HF_TOKEN")
	device = "cuda" if torch.cuda.is_available() else "cpu"
	print(f"[LLM] Loading {MODEL_ID} on {device} ...")

	self.tokenizer = AutoTokenizer.from_pretrained(
	MODEL_ID,
	token=hf_token,
	)

	model = AutoModelForCausalLM.from_pretrained(
	MODEL_ID,
	token=hf_token,
	torch_dtype=torch.float16 if device == "cuda" else torch.float32,
	device_map="auto" if device == "cuda" else None,
	low_cpu_mem_usage=True,
	)
	if device == "cpu":
	model = model.to("cpu")

	self.pipe = pipeline(
	"text-generation",
	model=model,
	tokenizer=self.tokenizer,
	device=0 if device == "cuda" else -1,
	)
	print(f"[LLM] Model loaded successfully.")

	def generate(
	self,
	user_message: str,
	context: str,
	history: list[dict] \| None = None,
	max_new_tokens: int = 512,
	) -> str:
	history = history or []
	context_note = (
	f"[Relevant information from Parsa's profile]\n{context}\n"
	f"[End of retrieved context]"
	)

	messages = [{"role": "system", "content": SYSTEM_PROMPT}]
	for turn in history[-6:]:
	messages.append({"role": turn["role"], "content": turn["content"]})
	messages.append({
	"role": "user",
	"content": f"{context_note}\n\nRecruiter question: {user_message}",
	})

	output = self.pipe(
	messages,
	max_new_tokens=max_new_tokens,
	temperature=0.4,
	top_p=0.9,
	do_sample=True,
	)

	# Extract only the assistant's new reply
	generated = output[0]["generated_text"]
	if isinstance(generated, list):
	# chat format returns list of messages
	return generated[-1]["content"].strip()
	return generated.strip()