""" LLM client — loads the model directly inside the HF Space. No external API calls needed. Works on free-tier CPU Spaces. Default model: Qwen/Qwen2.5-1.5B-Instruct (fast on CPU, no gating) """ import os import torch from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline MODEL_ID = os.getenv("MODEL_ID", "meta-llama/Llama-3.2-8B-Instruct") SYSTEM_PROMPT = """You are Parsa's personal AI assistant. You help recruiters and hiring managers learn about Parsa Rouhi — an AI/ML engineer seeking roles in the UK. Use the provided context (retrieved from Parsa's knowledge base) to answer questions accurately. Be professional, warm, and concise. If a question isn't covered by the context, say so honestly — don't invent information. Always speak about Parsa in third person. Keep answers focused and relevant to a recruiting context.""" class LLMClient: def __init__(self): hf_token = os.getenv("HF_TOKEN") device = "cuda" if torch.cuda.is_available() else "cpu" print(f"[LLM] Loading {MODEL_ID} on {device} ...") self.tokenizer = AutoTokenizer.from_pretrained( MODEL_ID, token=hf_token, ) model = AutoModelForCausalLM.from_pretrained( MODEL_ID, token=hf_token, torch_dtype=torch.float16 if device == "cuda" else torch.float32, device_map="auto" if device == "cuda" else None, low_cpu_mem_usage=True, ) if device == "cpu": model = model.to("cpu") self.pipe = pipeline( "text-generation", model=model, tokenizer=self.tokenizer, device=0 if device == "cuda" else -1, ) print(f"[LLM] Model loaded successfully.") def generate( self, user_message: str, context: str, history: list[dict] | None = None, max_new_tokens: int = 512, ) -> str: history = history or [] context_note = ( f"[Relevant information from Parsa's profile]\n{context}\n" f"[End of retrieved context]" ) messages = [{"role": "system", "content": SYSTEM_PROMPT}] for turn in history[-6:]: messages.append({"role": turn["role"], "content": turn["content"]}) messages.append({ "role": "user", "content": f"{context_note}\n\nRecruiter question: {user_message}", }) output = self.pipe( messages, max_new_tokens=max_new_tokens, temperature=0.4, top_p=0.9, do_sample=True, ) # Extract only the assistant's new reply generated = output[0]["generated_text"] if isinstance(generated, list): # chat format returns list of messages return generated[-1]["content"].strip() return generated.strip()