Spaces:

arifRB
/

gapguide-api

Sleeping

App Files Files Community

gapguide-api / apps /analysis /services.py

arifRB

Deploy GapGuide backend (Docker)

ffd36e0 verified 15 days ago

Raw

History Blame Contribute Delete

17.2 kB

	"""Gap Analysis + Recommendation services.

	Gap analysis — Weighted Competency Fulfillment Algorithm.
	See research/design-decision-changes.md §Module 4 for derivation. Pure functions,
	no DB writes: the report is computed on demand from UserSkill + RoleSkill state.

	Recommendations — composite ranking over the Resource catalog.
	See research/08-next-modules-build-plan.md §Module A. Type weights from
	research/03-gap-analysis-engine-research.md §2.1 are the single source of truth
	for type preference.
	"""
	from __future__ import annotations

	from dataclasses import asdict, dataclass, field
	from typing import Iterable

	from apps.resources.models import Resource, SkillResource
	from apps.roles.models import Role, RoleSkill
	from apps.skills.models import UserSkill
	from apps.skills.utils import proficiency_to_level


	# Minimum proficiency a user must reach to fully satisfy a skill required at
	# each level; compute_gap_report() looks the threshold up by required_level.
	LEVEL_THRESHOLDS: dict[str, int] = {
	    'BEGINNER': 40,
	    'INTERMEDIATE': 60,
	    'ADVANCED': 100,
	}

	# Ceiling applied to readiness while any mandatory skill is below threshold.
	MANDATORY_CAP = 60.0

	# (floor, band) pairs in descending floor order; _band_for() returns the
	# first band whose floor the readiness reaches.
	BANDS: list[tuple[float, str]] = [
	    (85.0, 'STRONG'),
	    (65.0, 'GOOD'),
	    (45.0, 'PARTIAL'),
	    (0.0, 'WEAK'),
	]


	@dataclass
	class SkillGap:
	    """One role requirement compared against the user's proficiency.

	    Instances are built by compute_gap_report(), one per RoleSkill row.
	    """

	    skill_id: int
	    skill_name: str
	    category: str
	    # Level name the role requires; a key of LEVEL_THRESHOLDS.
	    required_level: str
	    # Proficiency needed to satisfy required_level.
	    threshold: int
	    # The user's proficiency for the skill; 0 when they have no UserSkill row.
	    user_proficiency: int
	    weight: float
	    is_mandatory: bool
	    # Points still missing to reach the threshold; never negative.
	    gap: int
	    # user_proficiency / threshold, capped at 1.0 and rounded to 4 places.
	    satisfaction: float
	    # 'MET', 'MISSING' or 'INSUFFICIENT' (see _classify).
	    gap_type: str
	    # 'CRITICAL', 'HIGH', 'MEDIUM' or 'LOW'; None when the skill is MET.
	    severity: str | None

	    def to_dict(self) -> dict:
	        """Return every field as a plain dict keyed by field name."""
	        return asdict(self)


	@dataclass
	class GapReport:
	    """Readiness summary for one role plus its per-skill gaps."""

	    role_id: int
	    role_name: str
	    # Readiness percentage, unrounded; to_dict() rounds it for output.
	    readiness: float
	    band: str
	    mandatory_cap_applied: bool
	    no_requirements: bool
	    gaps: list[SkillGap] = field(default_factory=list)

	    def to_dict(self) -> dict:
	        """Return the report as a plain dict.

	        Readiness is rounded to 2 places, the module-level MANDATORY_CAP is
	        included alongside the applied flag, and every gap is serialised
	        with its own to_dict().
	        """
	        serialised_gaps = [entry.to_dict() for entry in self.gaps]
	        rounded_readiness = round(self.readiness, 2)
	        return {
	            'role_id': self.role_id,
	            'role_name': self.role_name,
	            'readiness': rounded_readiness,
	            'band': self.band,
	            'mandatory_cap_applied': self.mandatory_cap_applied,
	            'mandatory_cap': MANDATORY_CAP,
	            'no_requirements': self.no_requirements,
	            'gaps': serialised_gaps,
	        }


	def _band_for(readiness: float) -> str:
	    """Return the name of the first BANDS entry whose floor ``readiness`` reaches.

	    Falls back to 'WEAK' when no floor is reached.
	    """
	    return next(
	        (label for minimum, label in BANDS if readiness >= minimum),
	        'WEAK',
	    )


	def _classify(
	user_proficiency: int,
	threshold: int,
	is_mandatory: bool,
	) -> tuple[str, str \| None]:
	"""Return (gap_type, severity) per §3 of design-decision-changes.md."""
	if user_proficiency >= threshold:
	return 'MET', None

	if user_proficiency == 0:
	return 'MISSING', 'CRITICAL' if is_mandatory else 'MEDIUM'

	ratio = (threshold - user_proficiency) / threshold
	if is_mandatory or ratio >= 0.5:
	severity = 'HIGH'
	elif ratio >= 0.25:
	severity = 'MEDIUM'
	else:
	severity = 'LOW'
	return 'INSUFFICIENT', severity


	def _user_proficiency_map(user_id: int, skill_ids: Iterable[int]) -> dict[int, int]:
	    """Map skill_id to proficiency for the user's UserSkill rows.

	    Only rows whose skill is in ``skill_ids`` are fetched; skills the user has
	    no row for are absent from the result.
	    """
	    wanted_ids = list(skill_ids)
	    rows = UserSkill.objects.filter(user_id=user_id, skill_id__in=wanted_ids)
	    proficiency_by_skill: dict[int, int] = {}
	    for row in rows:
	        proficiency_by_skill[row.skill_id] = row.proficiency
	    return proficiency_by_skill


	def compute_gap_report(user, role: Role) -> GapReport:
	    """Build the readiness report for ``user`` against ``role``.

	    Each RoleSkill contributes ``min(proficiency / threshold, 1.0)`` weighted
	    by its ``weight``; readiness is that weighted mean scaled to 0..100. A
	    skill the user has no UserSkill row for counts as proficiency 0. While any
	    mandatory skill is below its threshold, readiness is capped at
	    MANDATORY_CAP.

	    Args:
	        user: Object whose ``id`` selects the UserSkill rows to read.
	        role: Role whose RoleSkill rows define the requirements.

	    Returns:
	        A GapReport. Its gaps are always ordered most severe first, then by
	        descending weight, then by skill name. A role without RoleSkill rows
	        yields readiness 100 with ``no_requirements=True`` and no gaps.

	    Raises:
	        KeyError: If a RoleSkill's required_level is not in LEVEL_THRESHOLDS.
	    """
	    role_skills = list(
	        RoleSkill.objects.filter(role=role).select_related('skill')
	    )

	    if not role_skills:
	        return GapReport(
	            role_id=role.id,
	            role_name=role.role_name,
	            readiness=100.0,
	            band=_band_for(100.0),
	            mandatory_cap_applied=False,
	            no_requirements=True,
	        )

	    prof_map = _user_proficiency_map(user.id, (rs.skill_id for rs in role_skills))

	    gaps: list[SkillGap] = []
	    weighted_sat_sum = 0.0
	    weight_sum = 0.0
	    mandatory_shortfall = False

	    for rs in role_skills:
	        threshold = LEVEL_THRESHOLDS[rs.required_level]
	        user_prof = prof_map.get(rs.skill_id, 0)
	        satisfaction = min(user_prof / threshold, 1.0)
	        gap_points = max(threshold - user_prof, 0)
	        gap_type, severity = _classify(user_prof, threshold, rs.is_mandatory)

	        if rs.is_mandatory and satisfaction < 1.0:
	            mandatory_shortfall = True

	        weight_sum += rs.weight
	        weighted_sat_sum += satisfaction * rs.weight

	        gaps.append(SkillGap(
	            skill_id=rs.skill_id,
	            skill_name=rs.skill.skill_name,
	            category=rs.skill.category,
	            required_level=rs.required_level,
	            threshold=threshold,
	            user_proficiency=user_prof,
	            weight=rs.weight,
	            is_mandatory=rs.is_mandatory,
	            gap=gap_points,
	            satisfaction=round(satisfaction, 4),
	            gap_type=gap_type,
	            severity=severity,
	        ))

	    # Order the gaps before ANY return below: callers such as suggest_roles()
	    # rely on report.gaps being severity-sorted, including on the
	    # degenerate-weight path. None (a MET skill) ranks last.
	    severity_rank = {'CRITICAL': 4, 'HIGH': 3, 'MEDIUM': 2, 'LOW': 1, None: 0}
	    gaps.sort(
	        key=lambda g: (-severity_rank[g.severity], -g.weight, g.skill_name)
	    )

	    if weight_sum <= 0:
	        # No positive weight to aggregate. With the RoleSkill DB CheckConstraints
	        # in force a mandatory skill always has weight>0, so weight_sum<=0 implies
	        # no mandatory shortfall and the documented zero-weight contract returns
	        # 100/no_requirements. The mandatory branch is defense-in-depth against a
	        # constraint-bypassing bulk write — it avoids a 0.0/0.0 ZeroDivisionError.
	        readiness = 0.0 if mandatory_shortfall else 100.0
	        return GapReport(
	            role_id=role.id,
	            role_name=role.role_name,
	            readiness=readiness,
	            band=_band_for(readiness),
	            # readiness here is 0 (mandatory unmet) or 100 (no requirements);
	            # neither is a value "capped down to 60", so the cap is not applied.
	            mandatory_cap_applied=False,
	            no_requirements=not mandatory_shortfall,
	            gaps=gaps,
	        )

	    raw_readiness = (weighted_sat_sum / weight_sum) * 100.0
	    # The cap only counts as "applied" when it actually lowers the score.
	    cap_applied = mandatory_shortfall and raw_readiness > MANDATORY_CAP
	    readiness = MANDATORY_CAP if cap_applied else raw_readiness
	    readiness = max(0.0, min(readiness, 100.0))

	    return GapReport(
	        role_id=role.id,
	        role_name=role.role_name,
	        readiness=readiness,
	        band=_band_for(readiness),
	        mandatory_cap_applied=cap_applied,
	        no_requirements=False,
	        gaps=gaps,
	    )


	# ---------------------------------------------------------------------------
	# Recommendation service
	# ---------------------------------------------------------------------------
	#
	# Recommendations are ranked by a TWO-KEYED order so that material which is
	# clearly the wrong level can never leapfrog appropriately-levelled material on
	# quality (type/relevance/rating) alone:
	#
	# 1. LEVEL TIER (primary) — the distance between the user's CURRENT level for
	# the skill (proficiency_to_level(), NOT the role's required level) and the
	# resource's difficulty. distance 0-1 = "near" (tier 0); distance >=2 =
	# "far" (tier 1). Every near resource ranks above every far one. So a
	# beginner always sees BEGINNER/INTERMEDIATE content before ADVANCED — a
	# hard guarantee, independent of how good the far resource looks otherwise.
	#
	# 2. COMPOSITE QUALITY SCORE (secondary) — orders resources WITHIN a tier:
	#
	# score = TYPE_WEIGHT[type] * w_type
	# + DIFFICULTY_MATCH(user_current_level, difficulty) * w_diff
	# + relevance_score * 10 * w_rel # normalise to 0..10
	# + (rating / 5.0) * 10 * w_rating # normalise to 0..10
	#
	# Within the near tier the difficulty term still gives exact-level a gentle
	# edge over one-level-off, but it stays SOFT: a much more relevant/stronger
	# one-level-off resource can still outrank a weak exact-level one. The only
	# HARD rule is the near-over-far split above.
	#
	# Coefficients (single source of truth — keep in sync with the docstring above):
	#
	# w_type = 1.0 (dominant — follows the type preference from research
	# §2.1 "Video > Course > Article > Docs")
	# w_diff = 1.0
	# w_rel = 1.5 (relevance is heavily weighted — a very-relevant DOCS
	# can outrank a somewhat-relevant VIDEO at the same level)
	# w_rating = 0.5
	#
	# Notable consequences, verified by unit tests:
	# * Equal rating/relevance/difficulty → VIDEO > COURSE > ARTICLE > DOCS.
	# * A resource <=1 level from the user ALWAYS ranks above one >=2 levels away,
	# regardless of type/relevance/rating (the level-tier guarantee).
	# * Within a tier, exact-level beats one-level-off on equal quality, yet a
	# relevance_score=1.0 DOCS can outrank a relevance_score=0.3 VIDEO at the
	# same level (relevance is weighted 1.5x), both at rating 3.5/5:
	# DOCS : 4*1.0 + 10*1.0 + (1.0*10)*1.5 + (3.5/5*10)*0.5 = 32.5
	# VIDEO: 10*1.0 + 10*1.0 + (0.3*10)*1.5 + (3.5/5*10)*0.5 = 28.0
	# * Stable tie-break by resource_id ASC so API snapshots don't flap.

	# Base score per resource type; _score() treats a type not listed here as 0.
	TYPE_WEIGHTS: dict[str, int] = {
	    'VIDEO': 10,
	    'COURSE': 8,
	    'ARTICLE': 6,
	    'DOCS': 4,
	}

	# Ordinal position of each difficulty level, used to measure level distance.
	DIFFICULTY_ORDER: dict[str, int] = {
	    'BEGINNER': 0,
	    'INTERMEDIATE': 1,
	    'ADVANCED': 2,
	}

	# Coefficients of the composite score terms (type, difficulty match,
	# relevance, rating).
	W_TYPE = 1.0
	W_DIFF = 1.0
	W_REL = 1.5
	W_RATING = 0.5

	# Difficulty-match points for a level distance of 0, 1 and 2-or-more.
	DIFFICULTY_EXACT = 10.0
	DIFFICULTY_ADJACENT = 5.0
	DIFFICULTY_MISMATCH = 0.0

	# A resource within this many levels of the user's current level is "near"
	# (tier 0); anything further is "far" (tier 1) and is ranked below every
	# near resource. With the 3-level scale this means: appropriate-level + one
	# step are near, a two-step mismatch (e.g. ADVANCED for a BEGINNER) is far.
	NEAR_MAX_DISTANCE = 1

	DEFAULT_LIMIT_PER_SKILL = 3

	# Upper bound on resources returned per skill gap. Caps an adversarial
	# ?limit=99999999 so a single request can't try to materialise the whole
	# catalog per skill; the curated catalog is small, so 50 is generous.
	MAX_LIMIT_PER_SKILL = 50


	@dataclass
	class RecommendationItem:
	    """A single recommended resource together with its ranking score."""

	    resource_id: int
	    title: str
	    provider: str
	    url: str
	    type: str
	    difficulty_level: str
	    duration: int
	    rating: float
	    # Relevance of the resource to the skill, taken from the SkillResource link.
	    relevance_score: float
	    # Composite quality score, rounded to 4 places by compute_recommendations().
	    score: float

	    def to_dict(self) -> dict:
	        """Return every field as a plain dict keyed by field name."""
	        as_mapping = asdict(self)
	        return as_mapping


	def _difficulty_match_score(anchor_level: str, resource_level: str) -> float:
	    """Return the difficulty-match points for a resource.

	    DIFFICULTY_EXACT when both levels are the same, DIFFICULTY_ADJACENT when
	    they are one step apart in DIFFICULTY_ORDER, DIFFICULTY_MISMATCH otherwise.
	    """
	    steps_apart = abs(
	        DIFFICULTY_ORDER[anchor_level] - DIFFICULTY_ORDER[resource_level]
	    )
	    if steps_apart > 1:
	        return DIFFICULTY_MISMATCH
	    return DIFFICULTY_ADJACENT if steps_apart else DIFFICULTY_EXACT


	def _difficulty_tier(anchor_level: str, resource_level: str) -> int:
	    """Return the primary ranking key for a resource: 0 ("near") or 1 ("far").

	    A resource is near when its level lies within NEAR_MAX_DISTANCE steps of
	    the anchor level. Because this key is sorted BEFORE the composite score,
	    a clearly mismatched resource (e.g. ADVANCED for a beginner) can never
	    outrank an appropriately levelled one on type/relevance/rating alone.
	    """
	    anchor_rank = DIFFICULTY_ORDER[anchor_level]
	    resource_rank = DIFFICULTY_ORDER[resource_level]
	    if abs(anchor_rank - resource_rank) > NEAR_MAX_DISTANCE:
	        return 1
	    return 0


	def _score(
	    type_: str,
	    anchor_level: str,
	    resource_level: str,
	    relevance_score: float,
	    rating: float,
	) -> float:
	    """Return the composite quality score used to order resources in a tier.

	    Sum of four weighted terms: the type weight (0 for a type missing from
	    TYPE_WEIGHTS), the difficulty match, relevance scaled by 10, and rating
	    scaled from a 5-point to a 10-point range.
	    """
	    type_term = TYPE_WEIGHTS.get(type_, 0) * W_TYPE
	    difficulty_term = _difficulty_match_score(anchor_level, resource_level) * W_DIFF
	    relevance_term = (relevance_score * 10.0) * W_REL
	    rating_term = (rating / 5.0 * 10.0) * W_RATING
	    return type_term + difficulty_term + relevance_term + rating_term


	def compute_recommendations(
	    user,
	    role: Role,
	    limit_per_skill: int = DEFAULT_LIMIT_PER_SKILL,
	) -> dict:
	    """Return {"role_id", "role_name", "recommendations": {skill_id: [...]}}.

	    Only returns entries for skills with an actual gap (MISSING or INSUFFICIENT);
	    MET skills are skipped. A gap skill with no linked resources maps to an
	    empty list. Stable tie-break by resource_id ASC.

	    Args:
	        user: Object passed through to compute_gap_report().
	        role: Role to recommend resources for.
	        limit_per_skill: Maximum resources per skill. Clamped to the range
	            0..MAX_LIMIT_PER_SKILL, so negative values yield empty lists.

	    Returns:
	        The response dict described above; each resource is a
	        RecommendationItem.to_dict() payload.
	    """
	    # A negative limit would otherwise slice from the end (scored[:-1] drops
	    # only the last item) and an oversized one is unbounded, so clamp both ends.
	    limit = max(0, min(limit_per_skill, MAX_LIMIT_PER_SKILL))

	    report = compute_gap_report(user, role)
	    gap_skills = [g for g in report.gaps if g.gap_type != 'MET']

	    recommendations: dict[int, list[dict]] = {}
	    # The same dict object is filled in below, so one response serves both
	    # the early return and the normal path.
	    response = {
	        'role_id': role.id,
	        'role_name': role.role_name,
	        'recommendations': recommendations,
	    }
	    if not gap_skills:
	        return response

	    skill_ids = [g.skill_id for g in gap_skills]
	    links = (
	        SkillResource.objects
	        .filter(skill_id__in=skill_ids)
	        .select_related('resource')
	    )

	    # Pre-seed every gap skill so one without resources still gets an entry.
	    by_skill: dict[int, list[tuple[SkillResource, Resource]]] = {
	        sid: [] for sid in skill_ids
	    }
	    for link in links:
	        by_skill[link.skill_id].append((link, link.resource))

	    # Anchor difficulty matching to the user's CURRENT level for each gap skill
	    # (derived from proficiency), not the role's required level — so learners are
	    # steered to material at their level first.
	    anchor_by_skill = {
	        g.skill_id: proficiency_to_level(g.user_proficiency) for g in gap_skills
	    }

	    for skill_id, pairs in by_skill.items():
	        anchor_level = anchor_by_skill[skill_id]
	        scored: list[tuple[int, float, int, RecommendationItem]] = []
	        for link, resource in pairs:
	            score = _score(
	                resource.type,
	                anchor_level,
	                resource.difficulty_level,
	                link.relevance_score,
	                resource.rating,
	            )
	            tier = _difficulty_tier(anchor_level, resource.difficulty_level)
	            scored.append((
	                tier,
	                score,
	                resource.id,
	                RecommendationItem(
	                    resource_id=resource.id,
	                    title=resource.title,
	                    provider=resource.provider,
	                    url=resource.url,
	                    type=resource.type,
	                    difficulty_level=resource.difficulty_level,
	                    duration=resource.duration,
	                    rating=resource.rating,
	                    relevance_score=link.relevance_score,
	                    score=round(score, 4),
	                ),
	            ))

	        # Primary: level tier (near before far). Secondary: composite score desc.
	        # Tertiary: resource_id asc for stable, snapshot-friendly ordering.
	        scored.sort(key=lambda t: (t[0], -t[1], t[2]))
	        recommendations[skill_id] = [
	            item.to_dict() for _, _, _, item in scored[:limit]
	        ]

	    return response


	# ---------------------------------------------------------------------------
	# Role suggestion service
	# ---------------------------------------------------------------------------
	#
	# Ranks every active role by how well the user's current skills fit it, so the
	# UI can surface "Recommended for you" (top fit) above the rest of the catalog.
	# The fit reflects an uploaded resume automatically: CV → UserSkill → gap
	# analysis is already wired, so suggestions update once parsed skills are saved.
	#
	# Reuses compute_gap_report per role (no duplicated readiness math). For the
	# small curated catalog (~10 roles, ceiling ~50) the ~2 queries/role cost is
	# negligible; batch into a single RoleSkill + UserSkill pass if it ever grows.

	DEFAULT_TOP_MISSING = 3


	def suggest_roles(user, top_missing: int = DEFAULT_TOP_MISSING) -> dict:
	    """Return {"has_skills", "roles": [...]} with roles ranked by skill fit.

	    Every active role gets one row: role_id, role_name, industry, readiness
	    (rounded to 2 places), band, matched_skills, total_skills,
	    top_missing_skills (at most ``top_missing`` names, taken in report order)
	    and no_requirements.

	    Rows are ordered by (no_requirements last, readiness desc, role_name asc):
	    a role with no requirements reports readiness=100 spuriously, so it sinks
	    to the bottom instead of topping the list.
	    """
	    has_skills = UserSkill.objects.filter(user_id=user.id).exists()

	    def _row_for(role) -> dict:
	        # One gap report per role; the readiness math lives in compute_gap_report.
	        report = compute_gap_report(user, role)
	        # Unmet names keep the report's gap order, so the slice below takes
	        # whichever unmet skills the report lists first.
	        unmet_names = [
	            entry.skill_name for entry in report.gaps if entry.gap_type != 'MET'
	        ]
	        total = len(report.gaps)
	        return {
	            'role_id': role.id,
	            'role_name': role.role_name,
	            'industry': role.industry,
	            'readiness': round(report.readiness, 2),
	            'band': report.band,
	            'matched_skills': total - len(unmet_names),
	            'total_skills': total,
	            'top_missing_skills': unmet_names[:top_missing],
	            'no_requirements': report.no_requirements,
	        }

	    rows: list[dict] = [_row_for(role) for role in Role.objects.filter(is_active=True)]
	    ranked = sorted(
	        rows,
	        key=lambda row: (row['no_requirements'], -row['readiness'], row['role_name']),
	    )
	    return {'has_skills': has_skills, 'roles': ranked}