| | """ |
| | Feature Extraction Layer |
| | |
| | Uses LLM to extract structured features from raw text inputs. |
| | Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER. |
| | """ |
| |
|
import json
import os
import re
from typing import Optional
| |
|
| | from .input_processor import ProcessedInput |
| | from .prompts.feature_extraction import ( |
| | ROLE_FEATURE_EXTRACTION_PROMPT, |
| | CANDIDATE_FEATURE_EXTRACTION_PROMPT, |
| | MATCH_ANALYSIS_PROMPT, |
| | ) |
| |
|
| |
|
class LLMClient:
    """Abstraction over LLM provider. Swap this for different backends.

    Each provider is reached through a private ``_call_*`` method, so new
    backends can be added without changing callers of ``complete``.
    """

    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        """
        Args:
            provider: One of "anthropic", "openai", or "google".
            model: Explicit model name; defaults to the provider's default.
        """
        self.provider = provider
        self.model = model or self._default_model()

    def _default_model(self) -> str:
        """Return the default model name for the configured provider."""
        defaults = {
            "anthropic": "claude-sonnet-4-20250514",
            "openai": "gpt-4o",
            "google": "gemini-2.5-flash-lite",
        }
        # Unknown providers fall back to the Anthropic default here;
        # complete() still raises ValueError before any API call is made.
        return defaults.get(self.provider, "claude-sonnet-4-20250514")

    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Send prompt to LLM and return response text.

        Raises:
            ValueError: If the configured provider is not supported.
        """
        if self.provider == "anthropic":
            return self._call_anthropic(prompt, temperature)
        elif self.provider == "openai":
            return self._call_openai(prompt, temperature)
        elif self.provider == "google":
            return self._call_google(prompt, temperature)
        else:
            raise ValueError(f"Unsupported provider: {self.provider}")

    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        """Single-turn completion via the Anthropic Messages API."""
        # Imported lazily so other providers work without this package installed.
        import anthropic

        client = anthropic.Anthropic()
        response = client.messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text

    def _call_openai(self, prompt: str, temperature: float) -> str:
        """Single-turn completion via the OpenAI Chat Completions API."""
        from openai import OpenAI

        client = OpenAI()
        response = client.chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        # message.content is Optional in the SDK (e.g. tool calls/refusals);
        # normalize to "" so the declared -> str contract always holds.
        return response.choices[0].message.content or ""

    def _call_google(self, prompt: str, temperature: float) -> str:
        """Single-turn completion via the Gemini API."""
        import google.generativeai as genai

        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(self.model)
        try:
            response = model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            )
            return response.text
        except Exception as e:
            # Chain the original exception so the root cause is preserved
            # in tracebacks instead of being flattened into the message.
            raise RuntimeError(
                f"Gemini API call failed (model={self.model}): {e}"
            ) from e
| |
|
| |
|
| | def _extract_json(text: str) -> dict: |
| | """Extract JSON from LLM response, handling markdown code fences.""" |
| | import re |
| |
|
| | text = text.strip() |
| |
|
| | |
| | try: |
| | return json.loads(text) |
| | except json.JSONDecodeError: |
| | pass |
| |
|
| | |
| | if "```" in text: |
| | |
| | match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL) |
| | if match: |
| | try: |
| | return json.loads(match.group(1).strip()) |
| | except json.JSONDecodeError: |
| | pass |
| |
|
| | |
| | lines = text.split("\n") |
| | if lines[0].strip().startswith("```"): |
| | lines = lines[1:] |
| | if lines and lines[-1].strip() == "```": |
| | lines = lines[:-1] |
| | try: |
| | return json.loads("\n".join(lines)) |
| | except json.JSONDecodeError: |
| | pass |
| |
|
| | |
| | match = re.search(r"\{.*\}", text, re.DOTALL) |
| | if match: |
| | try: |
| | return json.loads(match.group(0)) |
| | except json.JSONDecodeError: |
| | pass |
| |
|
| | raise json.JSONDecodeError( |
| | f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}", |
| | text, 0 |
| | ) |
| |
|
| |
|
class FeatureExtractor:
    """Turns processed inputs into structured features via LLM prompts.

    Each public method formats one prompt template, sends it through the
    configured LLM client, and parses the response into a dict.
    """

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Fall back to a default client when none is injected.
        self.llm = llm_client if llm_client is not None else LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from job description."""
        ctx = processed.company_context
        prompt = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from resume."""
        prompt = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(prompt))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Analyze the match between role and candidate features."""
        ctx = processed.company_context
        prompt = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))
|