Spaces:

MUHAMMADSAADAMIN
/

Github-AI-Reviewer

Running

Github-AI-Reviewer / app /graph /workflow.py

SENODROOM

fixed

1ca4563 25 days ago

6.75 kB

	from __future__ import annotations

	from collections import defaultdict
	import math
	from typing import Any

	from app.ai.embeddings import CodeEmbeddingService
	from app.ai.scoring import ScoringEngine
	from app.clients.github_graphql import GitHubGraphQLClient
	from app.core.config import settings
	from app.clients.streak import compute_streak_from_calendar
	from app.graph.state import ProfileState


	def _extract_features(user: dict[str, Any]) -> tuple[dict[str, int], dict[str, int], list[str]]:
	repositories = user.get("repositories", {}).get("nodes", [])
	language_sizes: dict[str, int] = defaultdict(int)
	repo_snippets: list[str] = []
	total_commits = 0
	total_stars = 0
	total_forks = 0

	for repo in repositories:
	name = repo.get("name") or ""
	desc = repo.get("description") or ""
	primary = (repo.get("primaryLanguage") or {}).get("name") or ""
	repo_snippets.append(f"repo:{name} lang:{primary} desc:{desc}")
	total_stars += int(repo.get("stargazerCount", 0))
	total_forks += int(repo.get("forkCount", 0))

	default_branch = repo.get("defaultBranchRef") or {}
	history = (default_branch.get("target") or {}).get("history", {})
	total_commits += int(history.get("totalCount", 0))

	for edge in repo.get("languages", {}).get("edges", []):
	lang_name = edge.get("node", {}).get("name") or "Unknown"
	language_sizes[lang_name] += int(edge.get("size", 0))

	merged_prs = int(user.get("pullRequests", {}).get("totalCount", 0))
	contributions = user.get("contributionsCollection", {})
	public_activity = user.get("publicActivity", {})
	public_commits = int(public_activity.get("publicCommits", contributions.get("totalCommitContributions", 0)) or 0)
	public_prs_created = int(public_activity.get("publicPRsCreated", contributions.get("totalPullRequestContributions", 0)) or 0)
	total_contributions = int(contributions.get("contributionCalendar", {}).get("totalContributions", 0))
	followers = int(user.get("followers", {}).get("totalCount", 0))
	metrics = {
	"repo_count": len(repositories),
	"total_commits": total_commits,
	"merged_prs": merged_prs,
	"public_commits": public_commits,
	"public_prs_created": public_prs_created,
	"total_contributions": total_contributions,
	"total_stars": total_stars,
	"total_forks": total_forks,
	"followers": followers,
	}
	return dict(language_sizes), metrics, repo_snippets


	def _scale_log(value: int, weight: int, factor: float = 2.0) -> int:
	return min(weight, int(math.log1p(max(value, 0)) * factor))


	def _normalize_activity(metrics: dict[str, int], data_source: str) -> int:
	    """Fold the raw counters into one activity score clamped to 0..100.

	    Each signal contributes at most a fixed cap (30 + 18 + 17 + 12 + 10 + 6 + 7).
	    Total contributions are divided by 12 when ``data_source`` is
	    ``"rest-public"`` and by 25 for any other source.
	    """
	    if data_source == "rest-public":
	        contribution_divisor = 12
	    else:
	        contribution_divisor = 25

	    components = (
	        min(30, metrics["total_contributions"] // contribution_divisor),
	        min(18, metrics["repo_count"] * 2),
	        min(17, metrics["total_commits"] // 80),
	        min(12, metrics["merged_prs"] // 3),
	        # Popularity signals grow logarithmically so large accounts saturate.
	        _scale_log(metrics["total_stars"], 10, 2.2),
	        _scale_log(metrics["total_forks"], 6, 2.0),
	        _scale_log(metrics["followers"], 7, 1.8),
	    )
	    total = sum(components)
	    return int(max(0, min(100, total)))


	def _normalize_consistency(current_streak: int, longest_streak: int) -> int:
	if longest_streak <= 0:
	return 0
	active_now = min(35, current_streak * 5)
	proven_consistency = min(65, longest_streak * 3)
	return int(max(0, min(100, active_now + proven_consistency)))


	def _language_breakdown(language_sizes: dict[str, int]) -> tuple[str, dict[str, int]]:
	if not language_sizes:
	return "Unknown", {}
	strongest = max(language_sizes.items(), key=lambda x: x[1])[0]
	total = sum(language_sizes.values()) or 1
	breakdown = {lang: int((size / total) * 100) for lang, size in sorted(language_sizes.items(), key=lambda x: x[1], reverse=True)}
	return strongest, breakdown


	class AnalyzerWorkflow:
	    """Pipeline that turns a GitHub username into a scored profile report."""

	    def __init__(self) -> None:
	        """Create the GitHub client, the embedding service and the scoring engine."""
	        self._github = GitHubGraphQLClient()
	        self._embedder = CodeEmbeddingService()
	        # The scorer's input dimension is taken from the embedder.
	        self._scorer = ScoringEngine(input_dim=self._embedder.embedding_dim)

	    async def run(self, username: str) -> ProfileState:
	        """Fetch, score and summarise the profile of ``username``.

	        Returns the state mapping with ``username``, the raw client response
	        under ``graphql_data`` and the assembled ``final_report``.
	        """
	        state: ProfileState = {"username": username}
	        raw = await self._github.analyze_user(username)
	        state["graphql_data"] = raw

	        # NOTE(review): assumes ``raw["data"]["user"]`` is a mapping; a null
	        # user would fail in the feature extraction below - confirm the client
	        # raises before that can happen.
	        user = raw["data"]["user"]
	        language_sizes, metrics, repo_snippets = _extract_features(user)
	        top_language, language_shares = _language_breakdown(language_sizes)

	        calendar = user.get("contributionsCollection", {}).get("contributionCalendar", {})
	        streak = compute_streak_from_calendar(calendar.get("weeks", []))
	        consistency = _normalize_consistency(streak.current_streak, streak.longest_streak)

	        embedding = self._embedder.embed_repository_signals(repo_snippets)
	        # The response may name its source; "graphql" is assumed otherwise.
	        data_source = raw.get("source", "graphql")
	        activity = _normalize_activity(metrics, data_source)
	        verdict = self._scorer.infer(embedding, activity, consistency)

	        public_activity = {
	            "public_commits": metrics["public_commits"],
	            "public_prs_created": metrics["public_prs_created"],
	        }
	        graphql_signals = {
	            "total_commits": metrics["total_commits"],
	            "merged_prs": metrics["merged_prs"],
	            "total_contributions": metrics["total_contributions"],
	        }
	        streak_data = {
	            "current_streak": streak.current_streak,
	            "longest_streak": streak.longest_streak,
	        }
	        public_metrics = {
	            "repositories": metrics["repo_count"],
	            "stars": metrics["total_stars"],
	            "forks": metrics["total_forks"],
	            "followers": metrics["followers"],
	        }
	        if self._embedder.ready:
	            embedding_backend = "transformers"
	        else:
	            embedding_backend = "deterministic-fallback"
	        model_info = {
	            # The model name is a fixed label, not read from the embedder.
	            "embedding_model": "microsoft/codebert-base",
	            "scoring_model": settings.scoring_backend,
	            "embedding_dim": self._embedder.embedding_dim,
	            "embedding_backend": embedding_backend,
	            "data_source": data_source,
	            "public_metrics": public_metrics,
	        }

	        # ``rating_score`` and ``hiring_readiness_score`` both carry the
	        # scorer's ``hiring_score``.
	        state["final_report"] = {
	            "username": username,
	            "rating_score": verdict.hiring_score,
	            "developer_level": verdict.level,
	            "confidence": verdict.confidence,
	            "strongest_language": top_language,
	            "language_breakdown": language_shares,
	            "hiring_readiness_score": verdict.hiring_score,
	            "consistency_score": consistency,
	            "public_activity": public_activity,
	            "graphql_signals": graphql_signals,
	            "streak_data": streak_data,
	            "model_info": model_info,
	        }
	        return state