# recruitment-intelligence / src / feature_extractor.py
# NOTE: switched default Google model to gemini-2.5-flash-lite for 10x faster
# responses (commit b03837c).
"""
Feature Extraction Layer
Uses LLM to extract structured features from raw text inputs.
Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER.
"""
import json
import os
from typing import Optional
from .input_processor import ProcessedInput
from .prompts.feature_extraction import (
ROLE_FEATURE_EXTRACTION_PROMPT,
CANDIDATE_FEATURE_EXTRACTION_PROMPT,
MATCH_ANALYSIS_PROMPT,
)
class LLMClient:
    """Abstraction over LLM provider. Swap this for different backends.

    Supported providers: "anthropic", "openai", "google". Provider SDKs are
    imported lazily inside the call methods so only the SDK actually used
    needs to be installed.
    """
    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        self.provider = provider
        # Fall back to a sensible per-provider default when no model is given.
        self.model = model or self._default_model()
    def _default_model(self) -> str:
        """Return the default model name for this client's provider."""
        defaults = {
            "anthropic": "claude-sonnet-4-20250514",
            "openai": "gpt-4o",
            "google": "gemini-2.5-flash-lite",
        }
        # Unknown providers fall back to the Anthropic default; complete()
        # will still reject them with a ValueError before any API call.
        return defaults.get(self.provider, "claude-sonnet-4-20250514")
    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Send prompt to LLM and return response text.

        Raises:
            ValueError: if the configured provider is not supported.
        """
        if self.provider == "anthropic":
            return self._call_anthropic(prompt, temperature)
        elif self.provider == "openai":
            return self._call_openai(prompt, temperature)
        elif self.provider == "google":
            return self._call_google(prompt, temperature)
        else:
            raise ValueError(f"Unsupported provider: {self.provider}")
    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        """Call the Anthropic Messages API and return the first text block."""
        import anthropic
        client = anthropic.Anthropic()
        response = client.messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text
    def _call_openai(self, prompt: str, temperature: float) -> str:
        """Call the OpenAI Chat Completions API and return the message text."""
        from openai import OpenAI
        client = OpenAI()
        response = client.chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content
    def _call_google(self, prompt: str, temperature: float) -> str:
        """Call the Gemini API and return the response text.

        Raises:
            ValueError: if GEMINI_API_KEY is not set in the environment.
            RuntimeError: if the underlying API call fails (original
                exception preserved via chaining).
        """
        import google.generativeai as genai
        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(self.model)
        try:
            response = model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            )
            return response.text
        except Exception as e:
            # Chain the original exception (PEP 3134) so the real SDK error
            # and its traceback are not lost in the re-raise.
            raise RuntimeError(f"Gemini API call failed (model={self.model}): {e}") from e
def _extract_json(text: str) -> dict:
"""Extract JSON from LLM response, handling markdown code fences."""
import re
text = text.strip()
# Try direct parse first
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Remove markdown code fences
if "```" in text:
# Match ```json ... ``` or ``` ... ```
match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL)
if match:
try:
return json.loads(match.group(1).strip())
except json.JSONDecodeError:
pass
# Fallback: strip first and last fence lines
lines = text.split("\n")
if lines[0].strip().startswith("```"):
lines = lines[1:]
if lines and lines[-1].strip() == "```":
lines = lines[:-1]
try:
return json.loads("\n".join(lines))
except json.JSONDecodeError:
pass
# Last resort: find first { ... } block
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
pass
raise json.JSONDecodeError(
f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}",
text, 0
)
class FeatureExtractor:
    """Extracts structured features from processed inputs using LLM."""

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Default to a freshly constructed client when none is injected.
        self.llm = llm_client or LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from job description."""
        ctx = processed.company_context
        prompt = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from resume."""
        prompt = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(prompt))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Analyze the match between role and candidate features."""
        ctx = processed.company_context
        prompt = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))