Spaces:
Paused
Paused
| """ | |
| LLM client — loads the model directly inside the HF Space. | |
| No external API calls needed. Works on free-tier CPU Spaces. | |
| Default model: Qwen/Qwen2.5-1.5B-Instruct (fast on CPU, no gating) | |
| """ | |
| import os | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline | |
# Hugging Face Hub repo id of the chat model to load. Override it with the
# MODEL_ID environment variable. The default matches the one documented in the
# module docstring: a small, ungated instruct model intended for CPU Spaces.
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen2.5-1.5B-Instruct")

# System message placed first in every conversation sent to the model.
SYSTEM_PROMPT = """You are Parsa's personal AI assistant. You help recruiters and hiring managers learn about Parsa Rouhi — an AI/ML engineer seeking roles in the UK.
Use the provided context (retrieved from Parsa's knowledge base) to answer questions accurately. Be professional, warm, and concise. If a question isn't covered by the context, say so honestly — don't invent information.
Always speak about Parsa in third person. Keep answers focused and relevant to a recruiting context."""
| class LLMClient: | |
| def __init__(self): | |
| hf_token = os.getenv("HF_TOKEN") | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| print(f"[LLM] Loading {MODEL_ID} on {device} ...") | |
| self.tokenizer = AutoTokenizer.from_pretrained( | |
| MODEL_ID, | |
| token=hf_token, | |
| ) | |
| model = AutoModelForCausalLM.from_pretrained( | |
| MODEL_ID, | |
| token=hf_token, | |
| torch_dtype=torch.float16 if device == "cuda" else torch.float32, | |
| device_map="auto" if device == "cuda" else None, | |
| low_cpu_mem_usage=True, | |
| ) | |
| if device == "cpu": | |
| model = model.to("cpu") | |
| self.pipe = pipeline( | |
| "text-generation", | |
| model=model, | |
| tokenizer=self.tokenizer, | |
| device=0 if device == "cuda" else -1, | |
| ) | |
| print(f"[LLM] Model loaded successfully.") | |
| def generate( | |
| self, | |
| user_message: str, | |
| context: str, | |
| history: list[dict] | None = None, | |
| max_new_tokens: int = 512, | |
| ) -> str: | |
| history = history or [] | |
| context_note = ( | |
| f"[Relevant information from Parsa's profile]\n{context}\n" | |
| f"[End of retrieved context]" | |
| ) | |
| messages = [{"role": "system", "content": SYSTEM_PROMPT}] | |
| for turn in history[-6:]: | |
| messages.append({"role": turn["role"], "content": turn["content"]}) | |
| messages.append({ | |
| "role": "user", | |
| "content": f"{context_note}\n\nRecruiter question: {user_message}", | |
| }) | |
| output = self.pipe( | |
| messages, | |
| max_new_tokens=max_new_tokens, | |
| temperature=0.4, | |
| top_p=0.9, | |
| do_sample=True, | |
| ) | |
| # Extract only the assistant's new reply | |
| generated = output[0]["generated_text"] | |
| if isinstance(generated, list): | |
| # chat format returns list of messages | |
| return generated[-1]["content"].strip() | |
| return generated.strip() |