MyChatbot / app /llm.py
Parsa2025AI's picture
loading llm model
a8f02a6 verified
"""
LLM client — loads the model directly inside the HF Space.
No external API calls needed. Works on free-tier CPU Spaces.
Default model: Qwen/Qwen2.5-1.5B-Instruct (fast on CPU, no gating)
"""
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# Hugging Face repo id of the model to load; override it by setting the
# MODEL_ID environment variable. The fallback is the small, ungated instruct
# model named in the module docstring, so a Space with no configuration and no
# HF_TOKEN can still download it and run on CPU.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")
# Persona and ground rules for the assistant; sent as the "system" message at
# the start of every conversation. One paragraph per line, newline-separated.
SYSTEM_PROMPT = "\n".join(
    (
        "You are Parsa's personal AI assistant. You help recruiters and hiring managers learn about Parsa Rouhi — an AI/ML engineer seeking roles in the UK.",
        "Use the provided context (retrieved from Parsa's knowledge base) to answer questions accurately. Be professional, warm, and concise. If a question isn't covered by the context, say so honestly — don't invent information.",
        "Always speak about Parsa in third person. Keep answers focused and relevant to a recruiting context.",
    )
)
class LLMClient:
def __init__(self):
hf_token = os.getenv("HF_TOKEN")
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[LLM] Loading {MODEL_ID} on {device} ...")
self.tokenizer = AutoTokenizer.from_pretrained(
MODEL_ID,
token=hf_token,
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
token=hf_token,
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
device_map="auto" if device == "cuda" else None,
low_cpu_mem_usage=True,
)
if device == "cpu":
model = model.to("cpu")
self.pipe = pipeline(
"text-generation",
model=model,
tokenizer=self.tokenizer,
device=0 if device == "cuda" else -1,
)
print(f"[LLM] Model loaded successfully.")
def generate(
self,
user_message: str,
context: str,
history: list[dict] | None = None,
max_new_tokens: int = 512,
) -> str:
history = history or []
context_note = (
f"[Relevant information from Parsa's profile]\n{context}\n"
f"[End of retrieved context]"
)
messages = [{"role": "system", "content": SYSTEM_PROMPT}]
for turn in history[-6:]:
messages.append({"role": turn["role"], "content": turn["content"]})
messages.append({
"role": "user",
"content": f"{context_note}\n\nRecruiter question: {user_message}",
})
output = self.pipe(
messages,
max_new_tokens=max_new_tokens,
temperature=0.4,
top_p=0.9,
do_sample=True,
)
# Extract only the assistant's new reply
generated = output[0]["generated_text"]
if isinstance(generated, list):
# chat format returns list of messages
return generated[-1]["content"].strip()
return generated.strip()