""" Feature Extraction Layer Uses LLM to extract structured features from raw text inputs. Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER. """ import json import os from typing import Optional from .input_processor import ProcessedInput from .prompts.feature_extraction import ( ROLE_FEATURE_EXTRACTION_PROMPT, CANDIDATE_FEATURE_EXTRACTION_PROMPT, MATCH_ANALYSIS_PROMPT, ) class LLMClient: """Abstraction over LLM provider. Swap this for different backends.""" def __init__(self, provider: str = "anthropic", model: Optional[str] = None): self.provider = provider self.model = model or self._default_model() def _default_model(self) -> str: defaults = { "anthropic": "claude-sonnet-4-20250514", "openai": "gpt-4o", "google": "gemini-2.5-flash-lite", } return defaults.get(self.provider, "claude-sonnet-4-20250514") def complete(self, prompt: str, temperature: float = 0.1) -> str: """Send prompt to LLM and return response text.""" if self.provider == "anthropic": return self._call_anthropic(prompt, temperature) elif self.provider == "openai": return self._call_openai(prompt, temperature) elif self.provider == "google": return self._call_google(prompt, temperature) else: raise ValueError(f"Unsupported provider: {self.provider}") def _call_anthropic(self, prompt: str, temperature: float) -> str: import anthropic client = anthropic.Anthropic() response = client.messages.create( model=self.model, max_tokens=4096, temperature=temperature, messages=[{"role": "user", "content": prompt}], ) return response.content[0].text def _call_openai(self, prompt: str, temperature: float) -> str: from openai import OpenAI client = OpenAI() response = client.chat.completions.create( model=self.model, temperature=temperature, messages=[{"role": "user", "content": prompt}], ) return response.choices[0].message.content def _call_google(self, prompt: str, temperature: float) -> str: import google.generativeai as genai api_key = os.environ.get("GEMINI_API_KEY") if not api_key: raise ValueError("GEMINI_API_KEY environment variable is not set") genai.configure(api_key=api_key) model = genai.GenerativeModel(self.model) try: response = model.generate_content( prompt, generation_config=genai.GenerationConfig(temperature=temperature), ) return response.text except Exception as e: raise RuntimeError(f"Gemini API call failed (model={self.model}): {e}") def _extract_json(text: str) -> dict: """Extract JSON from LLM response, handling markdown code fences.""" import re text = text.strip() # Try direct parse first try: return json.loads(text) except json.JSONDecodeError: pass # Remove markdown code fences if "```" in text: # Match ```json ... ``` or ``` ... ``` match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL) if match: try: return json.loads(match.group(1).strip()) except json.JSONDecodeError: pass # Fallback: strip first and last fence lines lines = text.split("\n") if lines[0].strip().startswith("```"): lines = lines[1:] if lines and lines[-1].strip() == "```": lines = lines[:-1] try: return json.loads("\n".join(lines)) except json.JSONDecodeError: pass # Last resort: find first { ... } block match = re.search(r"\{.*\}", text, re.DOTALL) if match: try: return json.loads(match.group(0)) except json.JSONDecodeError: pass raise json.JSONDecodeError( f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}", text, 0 ) class FeatureExtractor: """Extracts structured features from processed inputs using LLM.""" def __init__(self, llm_client: Optional[LLMClient] = None): self.llm = llm_client or LLMClient() def extract_role_features(self, processed: ProcessedInput) -> dict: """Extract structured features from job description.""" prompt = ROLE_FEATURE_EXTRACTION_PROMPT.format( job_description=processed.job_description, company_stage=processed.company_context.stage, industry=processed.company_context.industry, compensation_band=processed.company_context.compensation_band, location=processed.company_context.location, remote_type=processed.company_context.remote_type, ) response = self.llm.complete(prompt) return _extract_json(response) def extract_candidate_features(self, processed: ProcessedInput) -> dict: """Extract structured features from resume.""" prompt = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format( resume_text=processed.resume_text, ) response = self.llm.complete(prompt) return _extract_json(response) def analyze_match( self, role_features: dict, candidate_features: dict, processed: ProcessedInput, ) -> dict: """Analyze the match between role and candidate features.""" prompt = MATCH_ANALYSIS_PROMPT.format( role_features=json.dumps(role_features, indent=2), candidate_features=json.dumps(candidate_features, indent=2), company_stage=processed.company_context.stage, industry=processed.company_context.industry, compensation_band=processed.company_context.compensation_band, remote_type=processed.company_context.remote_type, ) response = self.llm.complete(prompt) return _extract_json(response)