Github-AI-Reviewer / app /graph /workflow.py
SENODROOM
fixed
1ca4563
Raw
History Blame Contribute Delete
6.75 kB
from __future__ import annotations
from collections import defaultdict
import math
from typing import Any
from app.ai.embeddings import CodeEmbeddingService
from app.ai.scoring import ScoringEngine
from app.clients.github_graphql import GitHubGraphQLClient
from app.core.config import settings
from app.clients.streak import compute_streak_from_calendar
from app.graph.state import ProfileState
def _int_or_zero(value: Any) -> int:
    """Return ``int(value)``, treating ``None`` and other falsy values as 0."""
    return int(value or 0)


def _extract_features(user: dict[str, Any]) -> tuple[dict[str, int], dict[str, int], list[str]]:
    """Summarize a user payload into language sizes, metrics and repo snippets.

    Every nested lookup tolerates a key that is missing *or* present with a
    ``None`` value (JSON ``null``). ``dict.get(key, default)`` only applies the
    default when the key is absent, so each lookup is followed by ``or {}``,
    ``or []`` or a zero fallback.

    Args:
        user: The ``user`` mapping of the profile payload.

    Returns:
        A 3-tuple ``(language_sizes, metrics, repo_snippets)``:

        * ``language_sizes`` maps a language name to the sum of the ``size``
          values of its language edges across all repositories; edges without
          a name are counted under ``"Unknown"``.
        * ``metrics`` holds the integer counters ``repo_count``,
          ``total_commits``, ``merged_prs``, ``public_commits``,
          ``public_prs_created``, ``total_contributions``, ``total_stars``,
          ``total_forks`` and ``followers``.
        * ``repo_snippets`` has one ``"repo:<name> lang:<primary> desc:<desc>"``
          string per repository.
    """
    repo_nodes = (user.get("repositories") or {}).get("nodes") or []
    # Drop null entries only; an empty dict still counts as a repository.
    repositories = [repo for repo in repo_nodes if repo is not None]

    language_sizes: dict[str, int] = defaultdict(int)
    repo_snippets: list[str] = []
    total_commits = 0
    total_stars = 0
    total_forks = 0

    for repo in repositories:
        name = repo.get("name") or ""
        desc = repo.get("description") or ""
        primary = (repo.get("primaryLanguage") or {}).get("name") or ""
        repo_snippets.append(f"repo:{name} lang:{primary} desc:{desc}")

        total_stars += _int_or_zero(repo.get("stargazerCount"))
        total_forks += _int_or_zero(repo.get("forkCount"))

        default_branch = repo.get("defaultBranchRef") or {}
        history = (default_branch.get("target") or {}).get("history") or {}
        total_commits += _int_or_zero(history.get("totalCount"))

        for edge in (repo.get("languages") or {}).get("edges") or []:
            if edge is None:
                continue
            lang_name = (edge.get("node") or {}).get("name") or "Unknown"
            language_sizes[lang_name] += _int_or_zero(edge.get("size"))

    contributions = user.get("contributionsCollection") or {}
    public_activity = user.get("publicActivity") or {}
    calendar = contributions.get("contributionCalendar") or {}

    # The ``publicActivity`` values win when their keys exist; otherwise the
    # matching ``contributionsCollection`` totals are used.
    public_commits = _int_or_zero(
        public_activity.get("publicCommits", contributions.get("totalCommitContributions", 0))
    )
    public_prs_created = _int_or_zero(
        public_activity.get("publicPRsCreated", contributions.get("totalPullRequestContributions", 0))
    )

    metrics = {
        "repo_count": len(repositories),
        "total_commits": total_commits,
        "merged_prs": _int_or_zero((user.get("pullRequests") or {}).get("totalCount")),
        "public_commits": public_commits,
        "public_prs_created": public_prs_created,
        "total_contributions": _int_or_zero(calendar.get("totalContributions")),
        "total_stars": total_stars,
        "total_forks": total_forks,
        "followers": _int_or_zero((user.get("followers") or {}).get("totalCount")),
    }
    # Return a plain dict so callers do not get defaultdict auto-insertion.
    return dict(language_sizes), metrics, repo_snippets
def _scale_log(value: int, weight: int, factor: float = 2.0) -> int:
    """Return a log-scaled score for *value*, capped at *weight*.

    Negative values are treated as 0. The score is ``log1p(value) * factor``
    truncated to an int, and never exceeds *weight*.
    """
    clamped = max(value, 0)
    scaled = int(math.log1p(clamped) * factor)
    return min(weight, scaled)
def _normalize_activity(metrics: dict[str, int], data_source: str) -> int:
    """Fold the raw activity counters into a single 0-100 score.

    Each component is capped on its own (30 + 18 + 17 + 12 + 10 + 6 + 7 = 100).
    Contributions are divided by 12 when *data_source* is ``"rest-public"``
    and by 25 otherwise.

    Args:
        metrics: Counter mapping; must contain ``total_contributions``,
            ``repo_count``, ``total_commits``, ``merged_prs``, ``total_stars``,
            ``total_forks`` and ``followers``.
        data_source: Identifier of the backend that produced the metrics.

    Returns:
        The summed component scores, clamped to the range 0-100.
    """
    if data_source == "rest-public":
        contribution_divisor = 12
    else:
        contribution_divisor = 25

    components = (
        # Linear components, each with its own ceiling.
        min(30, metrics["total_contributions"] // contribution_divisor),
        min(18, metrics["repo_count"] * 2),
        min(17, metrics["total_commits"] // 80),
        min(12, metrics["merged_prs"] // 3),
        # Popularity components use logarithmic scaling.
        _scale_log(metrics["total_stars"], 10, 2.2),
        _scale_log(metrics["total_forks"], 6, 2.0),
        _scale_log(metrics["followers"], 7, 1.8),
    )
    total = sum(components)
    return int(max(0, min(100, total)))
def _normalize_consistency(current_streak: int, longest_streak: int) -> int:
    """Turn streak lengths into a 0-100 consistency score.

    The current streak contributes 5 points per unit, up to 35. The longest
    streak contributes 3 points per unit, up to 65. A non-positive longest
    streak always scores 0.
    """
    if longest_streak > 0:
        recent_points = min(35, current_streak * 5)
        historical_points = min(65, longest_streak * 3)
        combined = recent_points + historical_points
        return int(max(0, min(100, combined)))
    return 0
def _language_breakdown(language_sizes: dict[str, int]) -> tuple[str, dict[str, int]]:
    """Return the dominant language and each language's percentage share.

    Args:
        language_sizes: Mapping of language name to its accumulated size.

    Returns:
        ``(strongest, breakdown)``. ``strongest`` is the language with the
        largest size. ``breakdown`` maps each language to its share of the
        total as a truncated integer percentage, ordered from largest to
        smallest. An empty input yields ``("Unknown", {})``.
    """
    if language_sizes:
        # A stable descending sort keeps insertion order among ties, so the
        # first entry is the same one a first-maximum scan would pick.
        ranked = sorted(language_sizes.items(), key=lambda pair: pair[1], reverse=True)
        top_language = ranked[0][0]

        # Fall back to 1 so an all-zero mapping does not divide by zero.
        denominator = sum(language_sizes.values()) or 1

        shares: dict[str, int] = {}
        for language, size in ranked:
            shares[language] = int((size / denominator) * 100)
        return top_language, shares
    return "Unknown", {}
class AnalyzerWorkflow:
    """Turn a GitHub username into a scored profile report.

    The workflow fetches the profile payload, derives features and streaks
    from it, embeds the repository snippets, scores the result and collects
    everything in the ``final_report`` entry of the returned state.
    """

    def __init__(self) -> None:
        """Create the GitHub client, the embedding service and the scorer."""
        self._github = GitHubGraphQLClient()
        embedder = CodeEmbeddingService()
        self._embedder = embedder
        # The scorer's input width is taken from the embedder's dimension.
        self._scorer = ScoringEngine(input_dim=embedder.embedding_dim)

    async def run(self, username: str) -> ProfileState:
        """Analyze *username* and return the populated profile state.

        Args:
            username: Login passed to the GitHub client's ``analyze_user``.

        Returns:
            A state mapping with ``username``, the raw payload under
            ``graphql_data`` and the assembled ``final_report``.
        """
        payload = await self._github.analyze_user(username)
        state: ProfileState = {"username": username, "graphql_data": payload}

        profile = payload["data"]["user"]
        language_sizes, metrics, snippets = _extract_features(profile)
        top_language, language_shares = _language_breakdown(language_sizes)

        # Streaks come from the weekly contribution calendar.
        collection = profile.get("contributionsCollection", {})
        calendar = collection.get("contributionCalendar", {})
        streak = compute_streak_from_calendar(calendar.get("weeks", []))
        consistency = _normalize_consistency(streak.current_streak, streak.longest_streak)

        embedding = self._embedder.embed_repository_signals(snippets)
        # The payload may name its backend; "graphql" is used when it does not.
        source = payload.get("source", "graphql")
        activity = _normalize_activity(metrics, source)
        verdict = self._scorer.infer(embedding, activity, consistency)

        # Sections are added in a fixed order so the report's key order is stable.
        report: dict[str, Any] = {
            "username": username,
            "rating_score": verdict.hiring_score,
            "developer_level": verdict.level,
            "confidence": verdict.confidence,
            "strongest_language": top_language,
            "language_breakdown": language_shares,
            "hiring_readiness_score": verdict.hiring_score,
            "consistency_score": consistency,
        }
        report["public_activity"] = {
            "public_commits": metrics["public_commits"],
            "public_prs_created": metrics["public_prs_created"],
        }
        report["graphql_signals"] = {
            "total_commits": metrics["total_commits"],
            "merged_prs": metrics["merged_prs"],
            "total_contributions": metrics["total_contributions"],
        }
        report["streak_data"] = {
            "current_streak": streak.current_streak,
            "longest_streak": streak.longest_streak,
        }
        report["model_info"] = {
            "embedding_model": "microsoft/codebert-base",
            "scoring_model": settings.scoring_backend,
            "embedding_dim": self._embedder.embedding_dim,
            "embedding_backend": (
                "deterministic-fallback" if not self._embedder.ready else "transformers"
            ),
            "data_source": source,
            "public_metrics": {
                "repositories": metrics["repo_count"],
                "stars": metrics["total_stars"],
                "forks": metrics["total_forks"],
                "followers": metrics["followers"],
            },
        }

        state["final_report"] = report
        return state