SkillForge / backend /agents /extractor.py
team99tech
Added changes
dfb3d07
from langchain_groq import ChatGroq
from langchain_core.messages import SystemMessage, HumanMessage
import json, os
from models.schemas import ExtractionResult
# Model identifier passed to ChatGroq by get_llm().
GROQ_MODEL = "llama-3.3-70b-versatile"
# System prompt sent as the SystemMessage in extract_skills(). It asks the
# model for bare JSON with the keys jd_skills, resume_skills, seniority_level
# and domain; the reply is then validated against ExtractionResult.
# NOTE: this text is runtime data -- any edit changes what the model is told.
EXTRACTION_SYSTEM = """You are a skill extraction engine for a technical interview platform.
Extract structured information from a job description and a candidate resume.
Return ONLY valid JSON matching this exact schema — no markdown fences, no explanation:
{
"jd_skills": [
{
"skill_id": "python",
"label": "Python",
"priority": "high",
"years_required": 3,
"context": "3+ years Python required"
}
],
"resume_skills": [
{
"skill_id": "python",
"label": "Python",
"evidence_strength": 0.7,
"years_mentioned": 2,
"context": "2 years Python scripting"
}
],
"seniority_level": "mid",
"domain": "backend"
}
skill_id rules: lowercase, underscores, no spaces. "GitHub Actions" → "github_actions"
evidence_strength rubric:
0.0 = not mentioned
0.4 = mentioned in passing
0.7 = mentioned with project/years
1.0 = specific metrics or production outcomes
priority: "high" if required/must-have, "medium" if preferred, "low" if nice-to-have
domain: one of backend | data_engineering | ml | devops"""
def get_llm() -> ChatGroq:
    """Create the Groq chat client used for skill extraction.

    The API key is read from the ``GROQ_API_KEY`` environment variable; when
    the variable is unset the placeholder ``"dummy"`` is passed instead, so
    constructing the client does not depend on the variable being present.

    Returns:
        A ``ChatGroq`` instance configured with ``GROQ_MODEL`` and a
        temperature of 0.1.
    """
    groq_key = os.getenv("GROQ_API_KEY", "dummy")
    client = ChatGroq(model=GROQ_MODEL, api_key=groq_key, temperature=0.1)
    return client
def _strip_code_fence(text: str) -> str:
    """Return *text* without a surrounding Markdown code fence, if present.

    Accepts both a bare ``` fence and one tagged ``json`` (in any letter
    case). The opening and closing fences are removed independently, so a
    reply carrying only one of them is still cleaned up.
    """
    text = text.strip()
    if text.startswith("```"):
        text = text[3:]
        # Drop the optional language tag that follows the opening fence.
        if text[:4].lower() == "json":
            text = text[4:]
    if text.endswith("```"):
        text = text[:-3]
    return text.strip()


def extract_skills(jd_text: str, resume_text: str) -> ExtractionResult:
    """Extract structured skills from a job description and a resume.

    Sends both texts to the LLM together with ``EXTRACTION_SYSTEM`` and
    validates the JSON reply against ``ExtractionResult``.

    Args:
        jd_text: Raw job-description text.
        resume_text: Raw candidate-resume text.

    Returns:
        The validated ``ExtractionResult``. If the reply cannot be parsed as
        JSON or does not validate, an empty result (no skills,
        ``seniority_level="junior"``, ``domain="backend"``) is returned and
        the failure is logged as a warning.

    Raises:
        Whatever ``llm.invoke`` raises: the model call itself is not guarded,
        only the handling of its reply.
    """
    import logging

    llm = get_llm()
    messages = [
        SystemMessage(content=EXTRACTION_SYSTEM),
        HumanMessage(content=f"JD: {jd_text}\n\nResume: {resume_text}"),
    ]
    # Kept outside the try block on purpose: errors from the model call
    # propagate to the caller instead of being masked as an empty result.
    response = llm.invoke(messages)
    try:
        # The prompt forbids Markdown fences, but models add them anyway.
        data = json.loads(_strip_code_fence(response.content))
        return ExtractionResult.model_validate(data)
    except Exception:
        # Best-effort fallback, but leave a trace so an empty extraction can
        # be told apart from a reply that failed to parse or validate.
        logging.getLogger(__name__).warning(
            "Could not parse skill-extraction reply; returning empty result",
            exc_info=True,
        )
        return ExtractionResult(
            jd_skills=[],
            resume_skills=[],
            seniority_level="junior",
            domain="backend",
        )
def normalize_skill_id(label: str) -> str:
    """Turn a human-readable skill label into a canonical skill id.

    The label is lowercased, and every run of characters other than
    ``a-z``/``0-9`` becomes a single underscore; leading and trailing
    underscores are dropped (e.g. ``"GitHub Actions"`` -> ``"github_actions"``).

    Args:
        label: Skill name as written in the source text.

    Returns:
        The normalized id; an empty string when the label contains no
        ASCII letters or digits.
    """
    import re

    lowered = label.lower()
    # Splitting on separator runs and re-joining the non-empty pieces is the
    # same as substituting "_" for each run and trimming "_" from both ends.
    pieces = re.split(r"[^a-z0-9]+", lowered)
    return "_".join(piece for piece in pieces if piece)