"""
Feature Extraction Layer
Uses LLM to extract structured features from raw text inputs.
Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER.
"""
import json
import os
from typing import Optional
from .input_processor import ProcessedInput
from .prompts.feature_extraction import (
ROLE_FEATURE_EXTRACTION_PROMPT,
CANDIDATE_FEATURE_EXTRACTION_PROMPT,
MATCH_ANALYSIS_PROMPT,
)
class LLMClient:
    """Thin wrapper over a chat-completion backend.

    Swappable abstraction: point ``provider`` at "anthropic", "openai",
    or "google" and the same ``complete()`` call works against any of them.
    """

    # Per-provider fallback models, used when the caller does not pin one.
    _DEFAULT_MODELS = {
        "anthropic": "claude-sonnet-4-20250514",
        "openai": "gpt-4o",
        "google": "gemini-2.5-flash-lite",
    }

    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        self.provider = provider
        # `or` (not `is None`) so an empty model string also falls back.
        self.model = model or self._default_model()

    def _default_model(self) -> str:
        # Unknown providers fall back to the Anthropic default.
        return self._DEFAULT_MODELS.get(self.provider, "claude-sonnet-4-20250514")

    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Route *prompt* to the configured provider and return the reply text."""
        dispatch = {
            "anthropic": self._call_anthropic,
            "openai": self._call_openai,
            "google": self._call_google,
        }
        handler = dispatch.get(self.provider)
        if handler is None:
            raise ValueError(f"Unsupported provider: {self.provider}")
        return handler(prompt, temperature)

    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        # SDK imported lazily so it is only required when actually used.
        import anthropic

        reply = anthropic.Anthropic().messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return reply.content[0].text

    def _call_openai(self, prompt: str, temperature: float) -> str:
        # SDK imported lazily so it is only required when actually used.
        from openai import OpenAI

        reply = OpenAI().chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return reply.choices[0].message.content

    def _call_google(self, prompt: str, temperature: float) -> str:
        # SDK imported lazily so it is only required when actually used.
        import google.generativeai as genai

        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")
        genai.configure(api_key=api_key)
        gemini = genai.GenerativeModel(self.model)
        try:
            # Keep .text inside the try: it can raise on blocked/empty responses.
            result = gemini.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            )
            return result.text
        except Exception as e:
            raise RuntimeError(f"Gemini API call failed (model={self.model}): {e}")
def _extract_json(text: str) -> dict:
"""Extract JSON from LLM response, handling markdown code fences."""
import re
text = text.strip()
# Try direct parse first
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Remove markdown code fences
if "```" in text:
# Match ```json ... ``` or ``` ... ```
match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL)
if match:
try:
return json.loads(match.group(1).strip())
except json.JSONDecodeError:
pass
# Fallback: strip first and last fence lines
lines = text.split("\n")
if lines[0].strip().startswith("```"):
lines = lines[1:]
if lines and lines[-1].strip() == "```":
lines = lines[:-1]
try:
return json.loads("\n".join(lines))
except json.JSONDecodeError:
pass
# Last resort: find first { ... } block
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
pass
raise json.JSONDecodeError(
f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}",
text, 0
)
class FeatureExtractor:
    """Turns processed inputs into structured feature dicts via LLM prompts."""

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Default to the standard client when none is injected.
        self.llm = llm_client or LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Return structured role features parsed from the job description."""
        ctx = processed.company_context
        filled = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(filled))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Return structured candidate features parsed from the resume."""
        filled = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(filled))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Return the LLM's analysis of how well candidate and role match."""
        ctx = processed.company_context
        filled = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(filled))