File size: 6,187 Bytes
ef92999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b03837c
ef92999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97adab6
 
 
 
 
ef92999
97adab6
 
 
 
 
 
 
 
ef92999
 
 
 
97adab6
 
ef92999
97adab6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef92999
97adab6
 
ef92999
 
97adab6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef92999
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""
Feature Extraction Layer

Uses LLM to extract structured features from raw text inputs.
Designed as a swappable module: v1 uses LLM, v2 can use fine-tuned models or NER.
"""

import json
import os
from typing import Optional

from .input_processor import ProcessedInput
from .prompts.feature_extraction import (
    ROLE_FEATURE_EXTRACTION_PROMPT,
    CANDIDATE_FEATURE_EXTRACTION_PROMPT,
    MATCH_ANALYSIS_PROMPT,
)


class LLMClient:
    """Abstraction over an LLM provider API.

    Dispatches ``complete()`` to the configured backend (Anthropic, OpenAI,
    or Google Gemini). Swap or subclass this to change backends without
    touching callers.
    """

    def __init__(self, provider: str = "anthropic", model: Optional[str] = None):
        """
        Args:
            provider: Backend name: "anthropic", "openai", or "google".
            model: Model identifier; falls back to a per-provider default.
        """
        self.provider = provider
        self.model = model or self._default_model()

    def _default_model(self) -> str:
        # Unknown providers fall back to the Anthropic default model name;
        # complete() still rejects them with ValueError before any API call.
        defaults = {
            "anthropic": "claude-sonnet-4-20250514",
            "openai": "gpt-4o",
            "google": "gemini-2.5-flash-lite",
        }
        return defaults.get(self.provider, "claude-sonnet-4-20250514")

    def complete(self, prompt: str, temperature: float = 0.1) -> str:
        """Send prompt to the configured LLM and return the response text.

        Args:
            prompt: Full prompt text sent as a single user message.
            temperature: Sampling temperature (low default for extraction).

        Raises:
            ValueError: If the provider is not a supported backend.
        """
        if self.provider == "anthropic":
            return self._call_anthropic(prompt, temperature)
        elif self.provider == "openai":
            return self._call_openai(prompt, temperature)
        elif self.provider == "google":
            return self._call_google(prompt, temperature)
        else:
            raise ValueError(f"Unsupported provider: {self.provider}")

    def _call_anthropic(self, prompt: str, temperature: float) -> str:
        # Imported lazily so the SDK is only required when this backend is used.
        import anthropic

        client = anthropic.Anthropic()
        response = client.messages.create(
            model=self.model,
            max_tokens=4096,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.content[0].text

    def _call_openai(self, prompt: str, temperature: float) -> str:
        from openai import OpenAI

        client = OpenAI()
        response = client.chat.completions.create(
            model=self.model,
            temperature=temperature,
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content

    def _call_google(self, prompt: str, temperature: float) -> str:
        import google.generativeai as genai

        api_key = os.environ.get("GEMINI_API_KEY")
        if not api_key:
            raise ValueError("GEMINI_API_KEY environment variable is not set")

        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(self.model)
        try:
            response = model.generate_content(
                prompt,
                generation_config=genai.GenerationConfig(temperature=temperature),
            )
            return response.text
        except Exception as e:
            # Chain the original exception so the underlying API failure
            # (quota, auth, safety block, ...) survives in the traceback.
            raise RuntimeError(
                f"Gemini API call failed (model={self.model}): {e}"
            ) from e


def _extract_json(text: str) -> dict:
    """Extract JSON from LLM response, handling markdown code fences."""
    import re

    text = text.strip()

    # Try direct parse first
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Remove markdown code fences
    if "```" in text:
        # Match ```json ... ``` or ``` ... ```
        match = re.search(r"```(?:json)?\s*\n(.*?)\n\s*```", text, re.DOTALL)
        if match:
            try:
                return json.loads(match.group(1).strip())
            except json.JSONDecodeError:
                pass

        # Fallback: strip first and last fence lines
        lines = text.split("\n")
        if lines[0].strip().startswith("```"):
            lines = lines[1:]
        if lines and lines[-1].strip() == "```":
            lines = lines[:-1]
        try:
            return json.loads("\n".join(lines))
        except json.JSONDecodeError:
            pass

    # Last resort: find first { ... } block
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        try:
            return json.loads(match.group(0))
        except json.JSONDecodeError:
            pass

    raise json.JSONDecodeError(
        f"Could not extract JSON from LLM response (first 200 chars): {text[:200]}",
        text, 0
    )


class FeatureExtractor:
    """Turns processed inputs into structured feature dicts via the LLM."""

    def __init__(self, llm_client: Optional[LLMClient] = None):
        # Default to a fresh client with the standard provider/model.
        self.llm = llm_client or LLMClient()

    def extract_role_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from the job description."""
        ctx = processed.company_context
        prompt = ROLE_FEATURE_EXTRACTION_PROMPT.format(
            job_description=processed.job_description,
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            location=ctx.location,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))

    def extract_candidate_features(self, processed: ProcessedInput) -> dict:
        """Extract structured features from the resume."""
        prompt = CANDIDATE_FEATURE_EXTRACTION_PROMPT.format(
            resume_text=processed.resume_text,
        )
        return _extract_json(self.llm.complete(prompt))

    def analyze_match(
        self,
        role_features: dict,
        candidate_features: dict,
        processed: ProcessedInput,
    ) -> dict:
        """Analyze the match between role and candidate features."""
        ctx = processed.company_context
        prompt = MATCH_ANALYSIS_PROMPT.format(
            role_features=json.dumps(role_features, indent=2),
            candidate_features=json.dumps(candidate_features, indent=2),
            company_stage=ctx.stage,
            industry=ctx.industry,
            compensation_band=ctx.compensation_band,
            remote_type=ctx.remote_type,
        )
        return _extract_json(self.llm.complete(prompt))