# recruitment-intelligence / src / feature_extractor.py
# NOTE: switched default Google model to gemini-2.5-flash-lite for 10x faster
# responses (commit b03837c).
"""
Feature Extraction Layer
Uses LLM to extract structured features from raw text inputs.
Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER.
"""
import json
import os
from typing import Optional
from .input_processor import ProcessedInput
from .prompts.feature_extraction import (
ROLE_FEATURE_EXTRACTION_PROMPT,
CANDIDATE_FEATURE_EXTRACTION_PROMPT,
MATCH_ANALYSIS_PROMPT,
)
class LLMClient:
    """Abstraction over LLM provider. Swap this for different backends.

    Supported providers: "anthropic", "openai", "google". Provider SDKs are
    imported lazily inside the call methods so only the SDK actually used
    needs to be installed.
    """
    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        self.provider = provider
        # Fall back to a sensible per-provider default when no model is given.
        self.model = model or self._default_model()
    def _default_model(self) -> str:
        """Return the default model name for this client's provider."""
        defaults = {
            "anthropic": "claude-sonnet-4-20250514",
            "openai": "gpt-4o",
            "google": "gemini-2.5-flash-lite",
        }
        # Unknown providers fall back to the Anthropic default; complete()
        # will still reject them with a ValueError before any API call.
        return defaults.get(self.provider, "claude-sonnet-4-20250514")
    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Send prompt to LLM and return response text.

        Raises:
            ValueError: if the configured provider is not supported.
        """
        if self.provider == "anthropic":
            return self._call_anthropic(prompt, temperature)
        elif self.provider == "openai":
            return self._call_openai(prompt, temperature)
        elif self.provider == "google":
            return self._call_google(prompt, temperature)
        else:
            raise ValueError(f"Unsupported provider: {self.provider}")
    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        """Call the Anthropic Messages API and return the first text block."""
        import anthropic
        client = anthropic.Anthropic()
        response = client.messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text
    def _call_openai(self, prompt: str, temperature: float) -> str:
        """Call the OpenAI Chat Completions API and return the message text."""
        from openai import OpenAI
        client = OpenAI()
        response = client.chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    def _call_google(self, prompt: str, temperature: float) -> str:
        """Call the Gemini API and return the response text.

        Raises:
            ValueError: if GEMINI_API_KEY is not set in the environment.
            RuntimeError: if the underlying API call fails (original
                exception preserved via chaining).
        """
        import google.generativeai as genai
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(self.model)
        try:
            response = model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            )
            return response.text
        except Exception as e:
            # Chain the original exception (PEP 3134) so the real SDK error
            # and its traceback are not lost in the re-raise.
            raise RuntimeError(f"Gemini API call failed (model={self.model}): {e}") from e
def _extract_json(text: str) -> dict:
"""Extract JSON from LLM response, handling markdown code fences."""
import re
text = text.strip()
# Try direct parse first
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Remove markdown code fences
if "```" in text:
# Match ```json ... ``` or ``` ... ```
match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL)
if match:
try:
return json.loads(match.group(1).strip())
except json.JSONDecodeError:
pass
# Fallback: strip first and last fence lines
lines = text.split("\n")
if lines[0].strip().startswith("```"):
lines = lines[1:]
if lines and lines[-1].strip() == "```":
lines = lines[:-1]
try:
return json.loads("\n".join(lines))
except json.JSONDecodeError:
pass
# Last resort: find first { ... } block
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
pass
raise json.JSONDecodeError(
f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}",
text, 0
)
class FeatureExtractor:
    """Extracts structured features from processed inputs using LLM."""

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Default to a freshly constructed client when none is injected.
        self.llm = llm_client or LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from job description."""
        ctx = processed.company_context
        prompt = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from resume."""
        prompt = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(prompt))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Analyze the match between role and candidate features."""
        ctx = processed.company_context
        prompt = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))