"""
agents/validator.py
────────────────────────────────────────────────────────────────────────────
ValidatorAgent: Three-stage security and validation pipeline for job descriptions.

Stage 1 — Programmatic filter (instant, zero LLM cost):
    - Minimum character length check
    - Forbidden prompt-injection phrase detection

Stage 2 — LLM gatekeeper (temperature=0.0 for repeatable verdicts):
    - Classifies the input as a VALID or INVALID job description
    - The job text is wrapped in XML tags so the LLM treats it as pure data

Stage 3 — Profile extraction (temperature=0.2, structured output):
    - Extracts INDUSTRY, ROLE_LEVEL, KEYWORDS (5-7), INTERVIEW_STYLE, TIPS
    - Returns a rich job_profile dict used by QuestionGenAgent and ScorerAgent
"""

import logging
import re


# Phrases that indicate a prompt-injection attempt rather than a real job
# posting. The validator compares them against the input case-insensitively,
# so every entry must be written in lower case.
_FORBIDDEN_PHRASES = [
    "ignore all previous",
    "ignore previous instructions",
    "disregard the above",
    "you are now",
    "act as",
    "pretend you are",
    "forget everything",
    "new instructions",
    "system prompt",
    "jailbreak",
    "do anything now",
    "dan mode",
    "respond with complete",
    "bypass your",
]

# Minimum length, in characters, of the whitespace-stripped job description.
# Anything shorter is rejected before any LLM call is made.
_MIN_LENGTH = 350


| class ValidatorAgent:
|
| """
|
| Validates and analyses an incoming job description.
|
|
|
| Usage:
|
| agent = ValidatorAgent(llm_fn) # llm_fn = ask_llm from engine
|
| result = agent.run(job_desc_text)
|
| # result: {"valid": True/False, "error_msg": str, ...profile fields}
|
| """
|
|
|
| def __init__(self, llm_fn):
|
| """
|
| Args:
|
| llm_fn: A callable that takes (prompt: str, temperature: float, max_tokens: int) β str.
|
| Provided by engine.py so the agent stays decoupled from the LLM client.
|
| """
|
| self._ask = llm_fn
|
|
|
|
|
| def run(self, job_desc: str) -> dict:
|
| """
|
| Full validation + extraction pipeline.
|
| Returns a dict with 'valid' key; if valid=True, also contains profile fields.
|
| """
|
| clean = job_desc.strip() if job_desc else ""
|
|
|
|
|
| if len(clean) < _MIN_LENGTH:
|
| return self._invalid("Job description is too short. Please paste the full posting (350+ characters).")
|
|
|
|
|
| lower = clean.lower()
|
| for phrase in _FORBIDDEN_PHRASES:
|
| if phrase in lower:
|
| return self._invalid("β οΈ Invalid input detected. Please paste a real job description.")
|
|
|
|
|
| gatekeeper_verdict = self._gatekeeper(clean)
|
| if not gatekeeper_verdict:
|
| return self._invalid("Please enter a complete and legitimate job description.")
|
|
|
|
|
| profile = self._extract_profile(clean)
|
| if not profile:
|
| return self._invalid("Could not analyse the job description. Please try again.")
|
|
|
| return {"valid": True, **profile}
|
|
|
|
|
|
|
| def _gatekeeper(self, clean_text: str) -> bool:
|
| """Returns True only if the LLM classifies the text as a legitimate JD."""
|
|
|
| prompt = f"""You are a security-hardened automated recruitment verification filter.
|
| Your ONLY task is to classify whether the text inside the input tags is a legitimate, fully-structured job description containing real duties and organizational requirements.
|
|
|
| If legitimate, reply with the exact word: VALID
|
| If not legitimate, reply with the exact word: INVALID
|
|
|
| Text to analyze:
|
| {clean_text[:1200]}
|
|
|
| Your response (VALID or INVALID):"""
|
|
|
| raw_response = self._ask(prompt, temperature=0.0, max_tokens=40)
|
|
|
| print(f"\n--- GATEKEEPER DEBUG --- \nRaw Model Output: '{raw_response}'\n-------------------------\n")
|
|
|
| verdict = raw_response.strip().upper()
|
| return "VALID" in verdict and "INVALID" not in verdict
|
|
|
|
|
|
|
| def _extract_profile(self, clean_text: str) -> dict | None:
|
| """
|
| Extracts structured profile metadata from a verified job description.
|
| Returns None if extraction fails (malformed LLM response).
|
| """
|
| prompt = f"""[INST] You are an expert HR analyst. Analyse this verified job description and extract structured metadata.
|
|
|
| Job Description:
|
| {clean_text[:2000]}
|
|
|
| Extract and respond using EXACTLY these XML tags (one tag per line, content on the same line):
|
| <INDUSTRY> the primary field/sector (e.g. Software Engineering, Healthcare, Finance, Marketing, Education) </INDUSTRY>
|
| <ROLE_LEVEL> seniority level: Junior / Mid-Level / Senior / Lead / Manager / Director / Executive </ROLE_LEVEL>
|
| <INTERVIEW_STYLE> primary interview style: Technical / Behavioral / Case-Based / Mixed </INTERVIEW_STYLE>
|
| <KEYWORDS> 5 to 7 comma-separated key skills or terms a strong candidate MUST mention </KEYWORDS>
|
| <TIPS> 4 specific bullet-point preparation tips for this exact role (start each with β’) </TIPS>
|
|
|
| Respond with only the 5 XML tag lines. No other text. [/INST]"""
|
|
|
| raw = self._ask(prompt, temperature=0.2, max_tokens=400)
|
|
|
| industry = self._tag(raw, "INDUSTRY")
|
| role_level = self._tag(raw, "ROLE_LEVEL")
|
| style = self._tag(raw, "INTERVIEW_STYLE")
|
| keywords_raw = self._tag(raw, "KEYWORDS")
|
| tips = self._tag(raw, "TIPS")
|
|
|
|
|
| if not keywords_raw:
|
| return None
|
|
|
| keywords = [k.strip() for k in keywords_raw.split(",") if k.strip()]
|
|
|
| return {
|
| "industry": industry or "General",
|
| "role_level": role_level or "Mid-Level",
|
| "interview_style": style or "Mixed",
|
| "keywords": keywords,
|
| "tips": tips or "",
|
| "job_snippet": clean_text[:80] + "...",
|
| }
|
|
|
|
|
| @staticmethod
|
| def _tag(text: str, name: str) -> str:
|
| """Extract content between <NAME> and </NAME> tags."""
|
| start, end = f"<{name}>", f"</{name}>"
|
| if start in text and end in text:
|
| return text.split(start)[1].split(end)[0].strip()
|
| return ""
|
|
|
| @staticmethod
|
| def _invalid(msg: str) -> dict:
|
| return {"valid": False, "error_msg": msg}
|
|
|