"""Turn a GitHub profile payload into a scored developer report."""

from __future__ import annotations

import math
from collections import defaultdict
from typing import Any

from app.ai.embeddings import CodeEmbeddingService
from app.ai.scoring import ScoringEngine
from app.clients.github_graphql import GitHubGraphQLClient
from app.clients.streak import compute_streak_from_calendar
from app.core.config import settings
from app.graph.state import ProfileState


def _obj(value: Any) -> dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict.

    ``dict.get(key, {})`` only covers a *missing* key; a key that is present
    with an explicit JSON ``null`` still yields ``None``. This helper covers
    both cases so chained lookups never hit ``None.get``.
    """
    return value if isinstance(value, dict) else {}


def _count(value: Any) -> int:
    """Coerce a possibly-null counter to ``int``, treating ``None`` as 0."""
    return int(value or 0)


def _extract_features(user: dict[str, Any]) -> tuple[dict[str, int], dict[str, int], list[str]]:
    """Collect language sizes, aggregate metrics and repo snippets for a user.

    Args:
        user: The ``user`` object of the profile payload. Nested objects and
            counters may be missing or ``null``; both are treated as empty/0.

    Returns:
        A 3-tuple ``(language_sizes, metrics, repo_snippets)``:

        * ``language_sizes`` maps language name to the summed ``size`` over
          all repositories.
        * ``metrics`` holds the integer counters used for scoring and
          reporting (``repo_count``, ``total_commits``, ``merged_prs``, ...).
        * ``repo_snippets`` holds one ``"repo:<name> lang:<primary>
          desc:<description>"`` string per repository.
    """
    nodes = _obj(user.get("repositories")).get("nodes") or []
    # Connection lists may contain null entries; drop them up front so they
    # are neither dereferenced nor counted as repositories.
    repositories = [repo for repo in nodes if isinstance(repo, dict)]

    language_sizes: dict[str, int] = defaultdict(int)
    repo_snippets: list[str] = []
    total_commits = 0
    total_stars = 0
    total_forks = 0

    for repo in repositories:
        name = repo.get("name") or ""
        desc = repo.get("description") or ""
        primary = _obj(repo.get("primaryLanguage")).get("name") or ""
        repo_snippets.append(f"repo:{name} lang:{primary} desc:{desc}")

        total_stars += _count(repo.get("stargazerCount"))
        total_forks += _count(repo.get("forkCount"))

        # defaultBranchRef / target / history are each guarded separately so
        # a null at any level contributes zero commits instead of raising.
        target = _obj(_obj(repo.get("defaultBranchRef")).get("target"))
        total_commits += _count(_obj(target.get("history")).get("totalCount"))

        for edge in _obj(repo.get("languages")).get("edges") or []:
            if not isinstance(edge, dict):
                continue
            lang_name = _obj(edge.get("node")).get("name") or "Unknown"
            language_sizes[lang_name] += _count(edge.get("size"))

    contributions = _obj(user.get("contributionsCollection"))
    public_activity = _obj(user.get("publicActivity"))

    # publicActivity values win when the key is present; otherwise fall back
    # to the contributionsCollection totals.
    public_commits = _count(
        public_activity.get("publicCommits", contributions.get("totalCommitContributions", 0))
    )
    public_prs_created = _count(
        public_activity.get("publicPRsCreated", contributions.get("totalPullRequestContributions", 0))
    )

    metrics = {
        "repo_count": len(repositories),
        "total_commits": total_commits,
        "merged_prs": _count(_obj(user.get("pullRequests")).get("totalCount")),
        "public_commits": public_commits,
        "public_prs_created": public_prs_created,
        "total_contributions": _count(
            _obj(contributions.get("contributionCalendar")).get("totalContributions")
        ),
        "total_stars": total_stars,
        "total_forks": total_forks,
        "followers": _count(_obj(user.get("followers")).get("totalCount")),
    }
    return dict(language_sizes), metrics, repo_snippets


def _scale_log(value: int, weight: int, factor: float = 2.0) -> int:
    """Return ``int(log1p(value) * factor)`` capped at *weight*.

    Negative *value* is clamped to 0 before taking the logarithm.
    """
    return min(weight, int(math.log1p(max(value, 0)) * factor))


def _normalize_activity(metrics: dict[str, int], data_source: str) -> int:
    """Fold the activity metrics into a single score in ``[0, 100]``.

    Each component is capped individually; the caps (30 + 18 + 17 + 12 +
    10 + 6 + 7) sum to 100.

    Args:
        metrics: Counters as produced by ``_extract_features``.
        data_source: Origin of the data. ``"rest-public"`` uses a smaller
            contribution divisor (12 instead of 25).
            NOTE(review): presumably because that source reports fewer
            contributions -- confirm.
    """
    contribution_divisor = 12 if data_source == "rest-public" else 25

    score = 0
    score += min(30, metrics["total_contributions"] // contribution_divisor)
    score += min(18, metrics["repo_count"] * 2)
    score += min(17, metrics["total_commits"] // 80)
    score += min(12, metrics["merged_prs"] // 3)
    score += _scale_log(metrics["total_stars"], 10, 2.2)
    score += _scale_log(metrics["total_forks"], 6, 2.0)
    score += _scale_log(metrics["followers"], 7, 1.8)
    return int(max(0, min(100, score)))


def _normalize_consistency(current_streak: int, longest_streak: int) -> int:
    """Score streak data in ``[0, 100]``.

    The current streak contributes up to 35 points (5 per unit) and the
    longest streak up to 65 points (3 per unit). Returns 0 when there has
    never been a streak (``longest_streak <= 0``).
    """
    if longest_streak <= 0:
        return 0
    active_now = min(35, current_streak * 5)
    proven_consistency = min(65, longest_streak * 3)
    return int(max(0, min(100, active_now + proven_consistency)))


def _language_breakdown(language_sizes: dict[str, int]) -> tuple[str, dict[str, int]]:
    """Return the dominant language and a percentage breakdown.

    Args:
        language_sizes: Mapping of language name to accumulated size.

    Returns:
        ``(strongest, breakdown)`` where ``breakdown`` maps each language to
        its integer percentage share, ordered from largest to smallest.
        Percentages are truncated, so they may sum to less than 100. An empty
        input yields ``("Unknown", {})``.
    """
    if not language_sizes:
        return "Unknown", {}

    strongest = max(language_sizes, key=language_sizes.get)
    # Guard against division by zero when every recorded size is 0.
    total = sum(language_sizes.values()) or 1
    ranked = sorted(language_sizes.items(), key=lambda item: item[1], reverse=True)
    breakdown = {lang: int((size / total) * 100) for lang, size in ranked}
    return strongest, breakdown


class AnalyzerWorkflow:
    """Fetch a GitHub profile, derive features, and build the final report."""

    def __init__(self) -> None:
        self._github = GitHubGraphQLClient()
        self._embedder = CodeEmbeddingService()
        # The scorer's input width follows the embedder's output dimension.
        self._scorer = ScoringEngine(input_dim=self._embedder.embedding_dim)

    async def run(self, username: str) -> ProfileState:
        """Analyze *username* and return the populated profile state.

        Args:
            username: GitHub login to analyze.

        Returns:
            A state mapping with ``username``, the raw payload under
            ``graphql_data`` and the assembled ``final_report``.

        Raises:
            ValueError: If the payload contains no user object (``null``).
        """
        state: ProfileState = {"username": username}
        raw = await self._github.analyze_user(username)
        state["graphql_data"] = raw

        user = raw["data"]["user"]
        if user is None:
            # Fail with a clear message instead of an AttributeError raised
            # from deep inside feature extraction.
            raise ValueError(f"GitHub user {username!r} not found or returned no profile data")

        data_source = raw.get("source", "graphql")

        lang_sizes, metrics, snippets = _extract_features(user)
        strongest_language, breakdown = _language_breakdown(lang_sizes)

        calendar = _obj(_obj(user.get("contributionsCollection")).get("contributionCalendar"))
        weeks = calendar.get("weeks") or []
        streak = compute_streak_from_calendar(weeks)
        consistency_score = _normalize_consistency(streak.current_streak, streak.longest_streak)

        embedding = self._embedder.embed_repository_signals(snippets)
        activity_score = _normalize_activity(metrics, data_source)
        scored = self._scorer.infer(embedding, activity_score, consistency_score)

        state["final_report"] = {
            "username": username,
            "rating_score": scored.hiring_score,
            "developer_level": scored.level,
            "confidence": scored.confidence,
            "strongest_language": strongest_language,
            "language_breakdown": breakdown,
            "hiring_readiness_score": scored.hiring_score,
            "consistency_score": consistency_score,
            "public_activity": {
                "public_commits": metrics["public_commits"],
                "public_prs_created": metrics["public_prs_created"],
            },
            "graphql_signals": {
                "total_commits": metrics["total_commits"],
                "merged_prs": metrics["merged_prs"],
                "total_contributions": metrics["total_contributions"],
            },
            "streak_data": {
                "current_streak": streak.current_streak,
                "longest_streak": streak.longest_streak,
            },
            "model_info": {
                "embedding_model": "microsoft/codebert-base",
                "scoring_model": settings.scoring_backend,
                "embedding_dim": self._embedder.embedding_dim,
                "embedding_backend": "transformers" if self._embedder.ready else "deterministic-fallback",
                "data_source": data_source,
                "public_metrics": {
                    "repositories": metrics["repo_count"],
                    "stars": metrics["total_stars"],
                    "forks": metrics["total_forks"],
                    "followers": metrics["followers"],
                },
            },
        }
        return state