"""
Feature Extraction Layer
Uses LLM to extract structured features from raw text inputs.
Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER.
"""
import json
import os
from typing import Optional
from .input_processor import ProcessedInput
from .prompts.feature_extraction import (
ROLE_FEATURE_EXTRACTION_PROMPT,
CANDIDATE_FEATURE_EXTRACTION_PROMPT,
MATCH_ANALYSIS_PROMPT,
)
class LLMClient:
    """Thin wrapper over a chat-completion backend.

    Swappable abstraction: point ``provider`` at "anthropic", "openai",
    or "google" and the same ``complete()`` call works against any of them.
    """

    # Per-provider fallback models, used when the caller does not pin one.
    _DEFAULT_MODELS = {
        "anthropic": "claude-sonnet-4-20250514",
        "openai": "gpt-4o",
        "google": "gemini-2.5-flash-lite",
    }

    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        self.provider = provider
        # `or` (not `is None`) so an empty model string also falls back.
        self.model = model or self._default_model()

    def _default_model(self) -> str:
        # Unknown providers fall back to the Anthropic default.
        return self._DEFAULT_MODELS.get(self.provider, "claude-sonnet-4-20250514")

    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Route *prompt* to the configured provider and return the reply text."""
        dispatch = {
            "anthropic": self._call_anthropic,
            "openai": self._call_openai,
            "google": self._call_google,
        }
        handler = dispatch.get(self.provider)
        if handler is None:
            raise ValueError(f"Unsupported provider: {self.provider}")
        return handler(prompt, temperature)

    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        # SDK imported lazily so it is only required when actually used.
        import anthropic

        reply = anthropic.Anthropic().messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return reply.content[0].text

    def _call_openai(self, prompt: str, temperature: float) -> str:
        # SDK imported lazily so it is only required when actually used.
        from openai import OpenAI

        reply = OpenAI().chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return reply.choices[0].message.content

    def _call_google(self, prompt: str, temperature: float) -> str:
        # SDK imported lazily so it is only required when actually used.
        import google.generativeai as genai

        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")
        genai.configure(api_key=api_key)
        gemini = genai.GenerativeModel(self.model)
        try:
            # Keep .text inside the try: it can raise on blocked/empty responses.
            result = gemini.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            )
            return result.text
        except Exception as e:
            raise RuntimeError(f"Gemini API call failed (model={self.model}): {e}")
def _extract_json(text: str) -> dict:
"""Extract JSON from LLM response, handling markdown code fences."""
import re
text = text.strip()
# Try direct parse first
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Remove markdown code fences
if "```" in text:
# Match ```json ... ``` or ``` ... ```
match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL)
if match:
try:
return json.loads(match.group(1).strip())
except json.JSONDecodeError:
pass
# Fallback: strip first and last fence lines
lines = text.split("\n")
if lines[0].strip().startswith("```"):
lines = lines[1:]
if lines and lines[-1].strip() == "```":
lines = lines[:-1]
try:
return json.loads("\n".join(lines))
except json.JSONDecodeError:
pass
# Last resort: find first { ... } block
match = re.search(r"\{.*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group(0))
except json.JSONDecodeError:
pass
raise json.JSONDecodeError(
f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}",
text, 0
)
class FeatureExtractor:
    """Turns processed inputs into structured feature dicts via LLM prompts."""

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Default to the standard client when none is injected.
        self.llm = llm_client or LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Return structured role features parsed from the job description."""
        ctx = processed.company_context
        filled = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(filled))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Return structured candidate features parsed from the resume."""
        filled = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(filled))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Return the LLM's analysis of how well candidate and role match."""
        ctx = processed.company_context
        filled = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(filled))