gapguide-api / apps /analysis /services.py
arifRB's picture
Deploy GapGuide backend (Docker)
ffd36e0 verified
Raw
History Blame Contribute Delete
17.2 kB
"""Gap Analysis + Recommendation services.
Gap analysis — Weighted Competency Fulfillment Algorithm.
See research/design-decision-changes.md §Module 4 for derivation. Pure functions,
no DB writes: the report is computed on demand from UserSkill + RoleSkill state.
Recommendations — composite ranking over the Resource catalog.
See research/08-next-modules-build-plan.md §Module A. Type weights from
research/03-gap-analysis-engine-research.md §2.1 are the single source of truth
for type preference.
"""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from typing import Iterable
from apps.resources.models import Resource, SkillResource
from apps.roles.models import Role, RoleSkill
from apps.skills.models import UserSkill
from apps.skills.utils import proficiency_to_level
# Proficiency score a user must reach for each required level; looked up per
# role skill to obtain the threshold that satisfaction and gap are measured
# against.
LEVEL_THRESHOLDS: dict[str, int] = {'BEGINNER': 40, 'INTERMEDIATE': 60, 'ADVANCED': 100}

# Readiness ceiling used when a mandatory skill is below its threshold and the
# raw readiness would otherwise exceed this value.
MANDATORY_CAP = 60.0

# (floor, band name) pairs in descending floor order: the first floor that the
# readiness reaches decides the band.
BANDS: list[tuple[float, str]] = [
    (85.0, 'STRONG'), (65.0, 'GOOD'), (45.0, 'PARTIAL'), (0.0, 'WEAK'),
]
@dataclass
class SkillGap:
    """Comparison of one required role skill against the user's proficiency.

    The field order below is also the positional-constructor order, so it must
    not be rearranged.
    """

    # Identity of the skill being compared.
    skill_id: int
    skill_name: str
    category: str
    # Requirement: the level name and the numeric threshold it maps to.
    required_level: str
    threshold: int
    # User's proficiency for this skill (0 when no record was found).
    user_proficiency: int
    # Weight and mandatory flag copied from the role requirement.
    weight: float
    is_mandatory: bool
    # Points missing to reach the threshold (never negative).
    gap: int
    # Fraction of the threshold already reached, at most 1.0.
    satisfaction: float
    # 'MET', 'MISSING' or 'INSUFFICIENT'.
    gap_type: str
    # 'CRITICAL' / 'HIGH' / 'MEDIUM' / 'LOW', or None when the skill is met.
    severity: str | None

    def to_dict(self) -> dict:
        """Return every field as a plain ``{field name: value}`` dict."""
        as_mapping = asdict(self)
        return as_mapping
@dataclass
class GapReport:
    """Aggregated gap-analysis result for one role."""

    role_id: int
    role_name: str
    # Overall readiness score for the role.
    readiness: float
    # Band label derived from the readiness score.
    band: str
    # True when readiness was lowered to MANDATORY_CAP.
    mandatory_cap_applied: bool
    # True when there was nothing to score the user against.
    no_requirements: bool
    # Per-skill breakdown; empty by default.
    gaps: list[SkillGap] = field(default_factory=list)

    def to_dict(self) -> dict:
        """Return a serialisable dict of the report.

        Readiness is rounded to two decimals, the module-level
        ``MANDATORY_CAP`` is included under ``'mandatory_cap'``, and each gap
        is expanded through ``SkillGap.to_dict``.
        """
        payload = {
            'role_id': self.role_id,
            'role_name': self.role_name,
            'readiness': round(self.readiness, 2),
            'band': self.band,
            'mandatory_cap_applied': self.mandatory_cap_applied,
            'mandatory_cap': MANDATORY_CAP,
            'no_requirements': self.no_requirements,
        }
        # Added last so the key order matches the documented payload shape.
        payload['gaps'] = [gap.to_dict() for gap in self.gaps]
        return payload
def _band_for(readiness: float) -> str:
    """Return the name of the first band in ``BANDS`` whose floor is reached.

    Falls back to ``'WEAK'`` when no floor matches (e.g. a negative score).
    """
    matching_names = (label for minimum, label in BANDS if readiness >= minimum)
    return next(matching_names, 'WEAK')
def _classify(
user_proficiency: int,
threshold: int,
is_mandatory: bool,
) -> tuple[str, str | None]:
"""Return (gap_type, severity) per §3 of design-decision-changes.md."""
if user_proficiency >= threshold:
return 'MET', None
if user_proficiency == 0:
return 'MISSING', 'CRITICAL' if is_mandatory else 'MEDIUM'
ratio = (threshold - user_proficiency) / threshold
if is_mandatory or ratio >= 0.5:
severity = 'HIGH'
elif ratio >= 0.25:
severity = 'MEDIUM'
else:
severity = 'LOW'
return 'INSUFFICIENT', severity
def _user_proficiency_map(user_id: int, skill_ids: Iterable[int]) -> dict[int, int]:
    """Return ``{skill_id: proficiency}`` for the user's matching UserSkill rows.

    Skills without a UserSkill row are simply absent from the result.
    """
    # Materialise first: callers may pass a one-shot generator.
    wanted_ids = list(skill_ids)
    rows = UserSkill.objects.filter(user_id=user_id, skill_id__in=wanted_ids)
    proficiency_by_skill: dict[int, int] = {}
    for row in rows:
        proficiency_by_skill[row.skill_id] = row.proficiency
    return proficiency_by_skill
# Sort rank for each severity label; None (a MET skill) ranks lowest.
_SEVERITY_RANK: dict[str | None, int] = {
    'CRITICAL': 4,
    'HIGH': 3,
    'MEDIUM': 2,
    'LOW': 1,
    None: 0,
}


def _gap_sort_key(gap: SkillGap) -> tuple[int, float, str]:
    """Order gaps most-severe first, then heavier weight first, then by name."""
    return (-_SEVERITY_RANK[gap.severity], -gap.weight, gap.skill_name)


def compute_gap_report(user, role: Role) -> GapReport:
    """Compute the weighted readiness report of ``user`` for ``role``.

    For every RoleSkill of the role the user's proficiency (0 when the user
    has no record for the skill) is compared with the threshold of the
    required level. Readiness is the weight-averaged satisfaction times 100;
    it is lowered to ``MANDATORY_CAP`` when a mandatory skill is below its
    threshold and the raw value exceeds the cap.

    Args:
        user: object exposing ``id``; its UserSkill rows are read.
        role: the Role whose RoleSkill requirements are evaluated.

    Returns:
        A GapReport. ``gaps`` is always ordered most-severe first (then by
        descending weight, then skill name). A role without requirements
        yields readiness 100 with ``no_requirements=True`` and no gaps.

    Raises:
        KeyError: if a RoleSkill carries a ``required_level`` that is not a
            key of ``LEVEL_THRESHOLDS``.
    """
    role_skills = list(
        RoleSkill.objects.filter(role=role).select_related('skill')
    )
    if not role_skills:
        return GapReport(
            role_id=role.id,
            role_name=role.role_name,
            readiness=100.0,
            band=_band_for(100.0),
            mandatory_cap_applied=False,
            no_requirements=True,
        )

    prof_map = _user_proficiency_map(user.id, (rs.skill_id for rs in role_skills))
    gaps: list[SkillGap] = []
    weighted_sat_sum = 0.0
    weight_sum = 0.0
    mandatory_shortfall = False

    for rs in role_skills:
        threshold = LEVEL_THRESHOLDS[rs.required_level]
        user_prof = prof_map.get(rs.skill_id, 0)
        # Over-achievement never counts for more than "fully satisfied".
        satisfaction = min(user_prof / threshold, 1.0)
        gap_points = max(threshold - user_prof, 0)
        gap_type, severity = _classify(user_prof, threshold, rs.is_mandatory)
        if rs.is_mandatory and satisfaction < 1.0:
            mandatory_shortfall = True
        weight_sum += rs.weight
        weighted_sat_sum += satisfaction * rs.weight
        gaps.append(SkillGap(
            skill_id=rs.skill_id,
            skill_name=rs.skill.skill_name,
            category=rs.skill.category,
            required_level=rs.required_level,
            threshold=threshold,
            user_proficiency=user_prof,
            weight=rs.weight,
            is_mandatory=rs.is_mandatory,
            gap=gap_points,
            satisfaction=round(satisfaction, 4),
            gap_type=gap_type,
            severity=severity,
        ))

    # Sort once, before either return path below, so every report exposes its
    # gaps most-severe first. Callers that take the first non-MET entries as
    # "the most critical missing skills" rely on this ordering.
    gaps.sort(key=_gap_sort_key)

    if weight_sum <= 0:
        # No positive weight to aggregate. With the RoleSkill DB CheckConstraints
        # in force a mandatory skill always has weight>0, so weight_sum<=0 implies
        # no mandatory shortfall and the documented zero-weight contract returns
        # 100/no_requirements. The mandatory branch is defense-in-depth against a
        # constraint-bypassing bulk write — it avoids a 0.0/0.0 ZeroDivisionError.
        readiness = 0.0 if mandatory_shortfall else 100.0
        return GapReport(
            role_id=role.id,
            role_name=role.role_name,
            readiness=readiness,
            band=_band_for(readiness),
            # readiness here is 0 (mandatory unmet) or 100 (no requirements);
            # neither is a value "capped down to 60", so the cap is not applied.
            mandatory_cap_applied=False,
            no_requirements=not mandatory_shortfall,
            gaps=gaps,
        )

    raw_readiness = (weighted_sat_sum / weight_sum) * 100.0
    # The cap only counts as "applied" when it actually lowers the score.
    cap_applied = mandatory_shortfall and raw_readiness > MANDATORY_CAP
    readiness = MANDATORY_CAP if cap_applied else raw_readiness
    readiness = max(0.0, min(readiness, 100.0))

    return GapReport(
        role_id=role.id,
        role_name=role.role_name,
        readiness=readiness,
        band=_band_for(readiness),
        mandatory_cap_applied=cap_applied,
        no_requirements=False,
        gaps=gaps,
    )
# ---------------------------------------------------------------------------
# Recommendation service
# ---------------------------------------------------------------------------
#
# Recommendations are ranked by a TWO-KEYED order so that material which is
# clearly the wrong level can never leapfrog appropriately-levelled material on
# quality (type/relevance/rating) alone:
#
# 1. LEVEL TIER (primary) — the distance between the user's CURRENT level for
# the skill (proficiency_to_level(), NOT the role's required level) and the
# resource's difficulty. distance 0-1 = "near" (tier 0); distance >=2 =
# "far" (tier 1). Every near resource ranks above every far one. So a
# beginner always sees BEGINNER/INTERMEDIATE content before ADVANCED — a
# hard guarantee, independent of how good the far resource looks otherwise.
#
# 2. COMPOSITE QUALITY SCORE (secondary) — orders resources WITHIN a tier:
#
# score = TYPE_WEIGHT[type] * w_type
# + DIFFICULTY_MATCH(user_current_level, difficulty) * w_diff
# + relevance_score * 10 * w_rel # normalise to 0..10
# + (rating / 5.0) * 10 * w_rating # normalise to 0..10
#
# Within the near tier the difficulty term still gives exact-level a gentle
# edge over one-level-off, but it stays SOFT: a much more relevant/stronger
# one-level-off resource can still outrank a weak exact-level one. The only
# HARD rule is the near-over-far split above.
#
# Coefficients (the W_* constants below are the single source of truth — keep
# the formula above in sync with them):
#
# w_type = 1.0 (baseline — encodes the type preference from research
# §2.1 "Video > Course > Article > Docs"; it decides the order when
# difficulty, relevance and rating are equal)
# w_diff = 1.0
# w_rel = 1.5 (relevance is heavily weighted — a very-relevant DOCS
# can outrank a somewhat-relevant VIDEO at the same level)
# w_rating = 0.5
#
# Notable consequences, verified by unit tests:
# * Equal rating/relevance/difficulty → VIDEO > COURSE > ARTICLE > DOCS.
# * A resource <=1 level from the user ALWAYS ranks above one >=2 levels away,
# regardless of type/relevance/rating (the level-tier guarantee).
# * Within a tier, exact-level beats one-level-off on equal quality, yet a
# relevance_score=1.0 DOCS can outrank a relevance_score=0.3 VIDEO at the
# same level (relevance is weighted 1.5x), both at rating 3.5/5:
# DOCS : 4*1.0 + 10*1.0 + (1.0*10)*1.5 + (3.5/5*10)*0.5 = 32.5
# VIDEO: 10*1.0 + 10*1.0 + (0.3*10)*1.5 + (3.5/5*10)*0.5 = 28.0
# * Stable tie-break by resource_id ASC so API snapshots don't flap.
# Base score per resource type; unknown types score 0 in _score().
TYPE_WEIGHTS: dict[str, int] = {'VIDEO': 10, 'COURSE': 8, 'ARTICLE': 6, 'DOCS': 4}

# Ordinal position of each difficulty level; level distance is the absolute
# difference of two positions.
DIFFICULTY_ORDER: dict[str, int] = {'BEGINNER': 0, 'INTERMEDIATE': 1, 'ADVANCED': 2}

# Multipliers of the four terms of the composite score.
W_TYPE = 1.0    # resource type term
W_DIFF = 1.0    # difficulty-match term
W_REL = 1.5     # relevance term
W_RATING = 0.5  # rating term

# Difficulty-match points by level distance.
DIFFICULTY_EXACT = 10.0     # distance 0
DIFFICULTY_ADJACENT = 5.0   # distance 1
DIFFICULTY_MISMATCH = 0.0   # distance 2 or more

# Largest level distance still treated as "near" (tier 0). Anything further is
# "far" (tier 1) and sorts below every near resource. On the 3-level scale the
# user's own level and one step away are near; a two-step mismatch (e.g.
# ADVANCED for a BEGINNER) is far.
NEAR_MAX_DISTANCE = 1

# Resources returned per skill gap when the caller does not specify a limit.
DEFAULT_LIMIT_PER_SKILL = 3

# Upper bound on resources returned per skill gap. Guards against an
# adversarial ?limit=99999999 trying to materialise the whole catalog for each
# skill; the curated catalog is small, so 50 is generous.
MAX_LIMIT_PER_SKILL = 50
@dataclass
class RecommendationItem:
    """One ranked resource as returned to the client for a skill gap.

    The field order below is also the positional-constructor order.
    """

    # Copied from the Resource row.
    resource_id: int
    title: str
    provider: str
    url: str
    type: str
    difficulty_level: str
    duration: int
    rating: float
    # Relevance of the resource to the skill, taken from the SkillResource link.
    relevance_score: float
    # Composite ranking score (rounded by the caller).
    score: float

    def to_dict(self) -> dict:
        """Return every field as a plain ``{field name: value}`` dict."""
        as_mapping = asdict(self)
        return as_mapping
def _difficulty_match_score(anchor_level: str, resource_level: str) -> float:
    """Return the difficulty-match points for a resource.

    The points depend only on how many levels separate ``anchor_level`` from
    ``resource_level``: exact match, one level apart, or further.

    Raises:
        KeyError: if either level is not a key of ``DIFFICULTY_ORDER``.
    """
    distance = abs(DIFFICULTY_ORDER[anchor_level] - DIFFICULTY_ORDER[resource_level])
    points_by_distance = {0: DIFFICULTY_EXACT, 1: DIFFICULTY_ADJACENT}
    return points_by_distance.get(distance, DIFFICULTY_MISMATCH)
def _difficulty_tier(anchor_level: str, resource_level: str) -> int:
    """Return the primary ranking key: 0 for a "near" resource, 1 for "far".

    A resource is near when its level is at most NEAR_MAX_DISTANCE steps from
    the anchor level. Because callers sort on this key BEFORE the composite
    score, a clearly mismatched resource (e.g. ADVANCED for a beginner) can
    never overtake an appropriately-levelled one through type, relevance or
    rating alone.

    Raises:
        KeyError: if either level is not a key of ``DIFFICULTY_ORDER``.
    """
    anchor_rank = DIFFICULTY_ORDER[anchor_level]
    resource_rank = DIFFICULTY_ORDER[resource_level]
    if abs(anchor_rank - resource_rank) > NEAR_MAX_DISTANCE:
        return 1
    return 0
def _score(
    type_: str,
    anchor_level: str,
    resource_level: str,
    relevance_score: float,
    rating: float,
) -> float:
    """Return the composite quality score used to order resources in a tier.

    The score is the sum of four weighted terms: the type weight (0 for an
    unknown type), the difficulty-match points, the relevance scaled by 10,
    and the rating rescaled from a 5-point to a 10-point range.
    """
    type_term = TYPE_WEIGHTS.get(type_, 0) * W_TYPE
    difficulty_term = _difficulty_match_score(anchor_level, resource_level) * W_DIFF
    relevance_term = (relevance_score * 10.0) * W_REL
    rating_term = (rating / 5.0 * 10.0) * W_RATING
    # Summed left to right in this fixed order so float results stay stable.
    return type_term + difficulty_term + relevance_term + rating_term
def compute_recommendations(
    user,
    role: Role,
    limit_per_skill: int = DEFAULT_LIMIT_PER_SKILL,
) -> dict:
    """Rank catalog resources for every unmet skill of ``role``.

    Only skills with an actual gap (MISSING or INSUFFICIENT) get an entry;
    MET skills are skipped. Within a skill, resources are ordered by level
    tier (near before far), then composite score descending, then
    resource_id ascending as a stable tie-break.

    Args:
        user: object exposing ``id``; forwarded to ``compute_gap_report``.
        role: the Role to recommend resources for.
        limit_per_skill: maximum resources per skill. Clamped to the range
            ``0..MAX_LIMIT_PER_SKILL``, so a negative value yields empty
            lists and an oversized value cannot exceed the cap.

    Returns:
        ``{"role_id", "role_name", "recommendations": {skill_id: [item, ...]}}``
        where each item is a ``RecommendationItem.to_dict()`` payload.
    """
    # A negative slice bound would drop items from the END of the ranking
    # instead of limiting it, and an unbounded one would defeat
    # MAX_LIMIT_PER_SKILL, so normalise once up front.
    limit = max(0, min(limit_per_skill, MAX_LIMIT_PER_SKILL))

    report = compute_gap_report(user, role)
    gap_skills = [g for g in report.gaps if g.gap_type != 'MET']
    recommendations: dict[int, list[dict]] = {}
    if not gap_skills:
        return {
            'role_id': role.id,
            'role_name': role.role_name,
            'recommendations': recommendations,
        }

    skill_ids = [g.skill_id for g in gap_skills]
    links = (
        SkillResource.objects
        .filter(skill_id__in=skill_ids)
        .select_related('resource')
    )
    # Pre-seed every gap skill so skills without resources still get an
    # (empty) entry, in the same order as the gap list.
    by_skill: dict[int, list[tuple[SkillResource, Resource]]] = {
        sid: [] for sid in skill_ids
    }
    for link in links:
        by_skill[link.skill_id].append((link, link.resource))

    # Anchor difficulty matching to the user's CURRENT level for each gap skill
    # (derived from proficiency), not the role's required level — so learners
    # are steered to material at their level first.
    anchor_by_skill = {
        g.skill_id: proficiency_to_level(g.user_proficiency) for g in gap_skills
    }

    for skill_id, pairs in by_skill.items():
        anchor_level = anchor_by_skill[skill_id]
        scored: list[tuple[int, float, int, RecommendationItem]] = []
        for link, resource in pairs:
            score = _score(
                resource.type,
                anchor_level,
                resource.difficulty_level,
                link.relevance_score,
                resource.rating,
            )
            tier = _difficulty_tier(anchor_level, resource.difficulty_level)
            scored.append((
                tier,
                score,
                resource.id,
                RecommendationItem(
                    resource_id=resource.id,
                    title=resource.title,
                    provider=resource.provider,
                    url=resource.url,
                    type=resource.type,
                    difficulty_level=resource.difficulty_level,
                    duration=resource.duration,
                    rating=resource.rating,
                    relevance_score=link.relevance_score,
                    score=round(score, 4),
                ),
            ))
        # Primary: level tier (near before far). Secondary: composite score
        # desc (unrounded). Tertiary: resource_id asc for stable,
        # snapshot-friendly ordering.
        scored.sort(key=lambda t: (t[0], -t[1], t[2]))
        recommendations[skill_id] = [
            item.to_dict() for _, _, _, item in scored[:limit]
        ]

    return {
        'role_id': role.id,
        'role_name': role.role_name,
        'recommendations': recommendations,
    }
# ---------------------------------------------------------------------------
# Role suggestion service
# ---------------------------------------------------------------------------
#
# Ranks every active role by how well the user's current skills fit it, so the
# UI can surface "Recommended for you" (top fit) above the rest of the catalog.
# The fit reflects an uploaded resume automatically: CV → UserSkill → gap
# analysis is already wired, so suggestions update once parsed skills are saved.
#
# Reuses compute_gap_report per role (no duplicated readiness math). For the
# small curated catalog (~10 roles, ceiling ~50) the ~2 queries/role cost is
# negligible; batch into a single RoleSkill + UserSkill pass if it ever grows.
# Number of missing-skill names listed per role when the caller gives none.
DEFAULT_TOP_MISSING = 3


def suggest_roles(user, top_missing: int = DEFAULT_TOP_MISSING) -> dict:
    """Return ``{"has_skills", "roles": [...]}`` ranked by skill fit.

    Every active role is evaluated with ``compute_gap_report``. Each role row
    holds: role_id, role_name, industry, readiness (rounded to 2 decimals),
    band, matched_skills, total_skills, top_missing_skills and
    no_requirements.

    Rows are sorted by (no_requirements last, readiness desc, role_name asc):
    roles with no requirements report readiness=100 spuriously, so they sink
    to the bottom rather than top the list.

    Args:
        user: object exposing ``id``; its UserSkill rows drive the ranking.
        top_missing: maximum number of missing skill names per role. Values
            below 0 are treated as 0.

    Returns:
        A dict with ``has_skills`` (whether the user has any UserSkill row)
        and ``roles`` (the sorted list of role rows).
    """
    # A negative slice bound would return "all but the last N" names rather
    # than none, so clamp before slicing.
    missing_limit = max(top_missing, 0)

    has_skills = UserSkill.objects.filter(user_id=user.id).exists()
    rows: list[dict] = []
    for role in Role.objects.filter(is_active=True):
        report = compute_gap_report(user, role)
        total = len(report.gaps)
        matched = sum(1 for g in report.gaps if g.gap_type == 'MET')
        # Names are taken in the order compute_gap_report returns the gaps;
        # on its normal path that is most-severe first, so the head of this
        # list holds the most critical missing skills.
        missing = [g.skill_name for g in report.gaps if g.gap_type != 'MET']
        rows.append({
            'role_id': role.id,
            'role_name': role.role_name,
            'industry': role.industry,
            'readiness': round(report.readiness, 2),
            'band': report.band,
            'matched_skills': matched,
            'total_skills': total,
            'top_missing_skills': missing[:missing_limit],
            'no_requirements': report.no_requirements,
        })

    # False sorts before True, which pushes no-requirement roles to the end.
    rows.sort(key=lambda r: (r['no_requirements'], -r['readiness'], r['role_name']))
    return {'has_skills': has_skills, 'roles': rows}