""" GitHub Repository Intelligence Analyzer Core analysis engine — scoring, complexity, and classification logic. Author: GSoC 2026 Pre-Task Submission """ import os import math import datetime import requests from dataclasses import dataclass, field from typing import Optional # ────────────────────────────────────────────── # Data model # ────────────────────────────────────────────── @dataclass class RepoReport: url: str owner: str name: str # Raw GitHub data stars: int = 0 forks: int = 0 open_issues: int = 0 watchers: int = 0 size_kb: int = 0 language: Optional[str] = None languages: dict = field(default_factory=dict) topics: list = field(default_factory=list) license: Optional[str] = None created_at: Optional[str] = None updated_at: Optional[str] = None pushed_at: Optional[str] = None default_branch: str = "main" has_wiki: bool = False has_pages: bool = False archived: bool = False # Fetched separately contributor_count: int = 0 commit_count_recent: int = 0 # commits in last 90 days release_count: int = 0 has_ci: bool = False dependency_files: list = field(default_factory=list) file_count: int = 0 # Computed scores activity_score: float = 0.0 complexity_score: float = 0.0 difficulty: str = "Unknown" fetch_error: Optional[str] = None # ────────────────────────────────────────────── # GitHub API client # ────────────────────────────────────────────── DEPENDENCY_FILES = [ "requirements.txt", "Pipfile", "pyproject.toml", # Python "package.json", "yarn.lock", "pnpm-lock.yaml", # JS/TS "Cargo.toml", "go.mod", "pom.xml", "build.gradle", # Rust/Go/Java "Gemfile", "composer.json", "mix.exs", # Ruby/PHP/Elixir ] CI_PATHS = [ ".github/workflows", ".travis.yml", ".circleci", "Jenkinsfile", ".gitlab-ci.yml", "azure-pipelines.yml", ] class GitHubClient: BASE = "https://api.github.com" def __init__(self, token: Optional[str] = None): self.session = requests.Session() self.session.headers.update( { "Accept": "application/vnd.github+json", "X-GitHub-Api-Version": "2022-11-28", } ) if token: self.session.headers["Authorization"] = f"Bearer {token}" def _get(self, path: str, params: dict = None) -> Optional[dict | list]: """Safe GET with rate-limit awareness.""" url = f"{self.BASE}{path}" try: resp = self.session.get(url, params=params, timeout=15) if resp.status_code == 403: remaining = resp.headers.get("X-RateLimit-Remaining", "?") reset = resp.headers.get("X-RateLimit-Reset", "?") raise RateLimitError( f"Rate limit hit. Remaining: {remaining}. Reset epoch: {reset}" ) if resp.status_code == 404: return None resp.raise_for_status() return resp.json() except RateLimitError: raise except requests.RequestException as e: raise FetchError(str(e)) from e def repo(self, owner: str, name: str) -> Optional[dict]: return self._get(f"/repos/{owner}/{name}") def languages(self, owner: str, name: str) -> dict: data = self._get(f"/repos/{owner}/{name}/languages") return data if isinstance(data, dict) else {} def contributors(self, owner: str, name: str) -> int: """Count contributors — uses pagination header trick for speed.""" resp = self.session.get( f"{self.BASE}/repos/{owner}/{name}/contributors", params={"per_page": 1, "anon": "true"}, timeout=15, ) if resp.status_code in (403, 404, 204): return 0 # GitHub returns last-page number in Link header link = resp.headers.get("Link", "") if 'rel="last"' in link: try: last_part = [p for p in link.split(",") if 'rel="last"' in p][0] page_num = int(last_part.split("page=")[-1].split(">")[0]) return page_num except (IndexError, ValueError): pass try: return len(resp.json()) except Exception: return 0 def recent_commits(self, owner: str, name: str, days: int = 90) -> int: since = ( datetime.datetime.utcnow() - datetime.timedelta(days=days) ).isoformat() + "Z" # Use commits endpoint with since filter; count via pagination resp = self.session.get( f"{self.BASE}/repos/{owner}/{name}/commits", params={"per_page": 1, "since": since}, timeout=15, ) if resp.status_code in (403, 404, 409): # 409 = empty repo return 0 link = resp.headers.get("Link", "") if 'rel="last"' in link: try: last_part = [p for p in link.split(",") if 'rel="last"' in p][0] return int(last_part.split("page=")[-1].split(">")[0]) except (IndexError, ValueError): pass try: return len(resp.json()) except Exception: return 0 def releases(self, owner: str, name: str) -> int: data = self._get(f"/repos/{owner}/{name}/releases", params={"per_page": 100}) return len(data) if isinstance(data, list) else 0 def tree_summary( self, owner: str, name: str, branch: str ) -> tuple[int, list, bool]: """Return (file_count, dependency_files_found, has_ci).""" data = self._get( f"/repos/{owner}/{name}/git/trees/{branch}", params={"recursive": "1"}, ) if not data or "tree" not in data: return 0, [], False paths = [item["path"] for item in data["tree"] if item["type"] == "blob"] file_count = len(paths) # Dependency detection (check file names, not full paths) filenames = {p.split("/")[-1] for p in paths} dep_found = [f for f in DEPENDENCY_FILES if f in filenames] # CI detection has_ci = any(any(p.startswith(ci) for p in paths) for ci in CI_PATHS) return file_count, dep_found, has_ci class RateLimitError(Exception): pass class FetchError(Exception): pass # ────────────────────────────────────────────── # Scoring engine # ────────────────────────────────────────────── def _days_since(iso_str: Optional[str]) -> float: """Days elapsed since an ISO 8601 timestamp. Returns large number if None.""" if not iso_str: return 9999 try: dt = datetime.datetime.fromisoformat(iso_str.replace("Z", "+00:00")) delta = datetime.datetime.now(datetime.timezone.utc) - dt return delta.total_seconds() / 86400 except Exception: return 9999 def compute_activity_score(report: RepoReport) -> float: """ Activity Score (0–100): measures how alive and maintained a repo is. Formula (weighted sum, each component 0–1, then ×100): Component Weight Rationale ───────────────────────────────────────────────────────── recent_commits (90d) 0.30 Primary signal of active dev contributor_count 0.20 Community health stars (log-scaled) 0.15 Popularity / interest forks (log-scaled) 0.10 Adoption / downstream use open_issues 0.10 Engagement (capped) recency (days since push) 0.10 Freshness releases 0.05 Delivery maturity Caps prevent one outlier metric from dominating. """ def log_scale(value: float, cap: int) -> float: """Map value to [0,1] using log scale with a cap.""" if value <= 0: return 0.0 return min(math.log1p(value) / math.log1p(cap), 1.0) def recency_score(days: float) -> float: """1.0 if pushed today, decays to 0.0 at 365 days.""" return max(0.0, 1.0 - days / 365.0) components = { "recent_commits": (log_scale(report.commit_count_recent, 500), 0.30), "contributors": (log_scale(report.contributor_count, 200), 0.20), "stars": (log_scale(report.stars, 5000), 0.15), "forks": (log_scale(report.forks, 1000), 0.10), "open_issues": (log_scale(report.open_issues, 100), 0.10), "recency": (recency_score(_days_since(report.pushed_at)), 0.10), "releases": (log_scale(report.release_count, 50), 0.05), } score = sum(val * weight for val, weight in components.values()) return round(score * 100, 2) def compute_complexity_score(report: RepoReport) -> float: """ Complexity Score (0–100): estimates technical depth of the repo. Component Weight Rationale ───────────────────────────────────────────────────────── file_count (log-scaled) 0.30 Codebase size language_diversity 0.25 Multi-lang = more complexity dependency_count 0.20 External surface area repo_size_kb (log) 0.15 Raw size proxy has_ci 0.10 Engineering maturity signal """ def log_scale(value: float, cap: int) -> float: if value <= 0: return 0.0 return min(math.log1p(value) / math.log1p(cap), 1.0) lang_count = len(report.languages) lang_diversity = log_scale(lang_count, 10) dep_count = len(report.dependency_files) dep_score = log_scale(dep_count, 8) components = { "file_count": (log_scale(report.file_count, 2000), 0.30), "lang_diversity": (lang_diversity, 0.25), "dependencies": (dep_score, 0.20), "size_kb": (log_scale(report.size_kb, 100_000), 0.15), "has_ci": (1.0 if report.has_ci else 0.0, 0.10), } score = sum(val * weight for val, weight in components.values()) return round(score * 100, 2) def classify_difficulty(activity: float, complexity: float) -> str: """ Classify learning difficulty using a 2D grid: High complexity + High activity → Advanced Either metric high → Intermediate Both metrics low → Beginner Thresholds (tuned empirically): Low : < 30 Medium : 30–60 High : > 60 """ avg = (activity + complexity) / 2 # Advanced: high complexity AND high activity if complexity >= 70 and activity >= 75: return "Advanced" # Intermediate: moderate-to-high on either dimension elif avg >= 50 or complexity >= 55 or activity >= 60: return "Intermediate" # Beginner: both metrics low else: return "Beginner" # ────────────────────────────────────────────── # Main analysis pipeline # ────────────────────────────────────────────── def parse_github_url(url: str) -> tuple[str, str]: """Extract (owner, repo) from various GitHub URL formats.""" url = url.strip().rstrip("/") # Remove trailing .git if url.endswith(".git"): url = url[:-4] parts = url.replace("https://", "").replace("http://", "").split("/") # Filter out empty strings and 'github.com' parts = [p for p in parts if p and p != "github.com"] if len(parts) < 2: raise ValueError(f"Cannot parse GitHub URL: {url!r}") return parts[0], parts[1] def analyze_repo(url: str, client: GitHubClient) -> RepoReport: """Full pipeline: fetch → score → classify → return report.""" try: owner, name = parse_github_url(url) except ValueError as e: return RepoReport(url=url, owner="?", name="?", fetch_error=str(e)) report = RepoReport(url=url, owner=owner, name=name) try: # 1. Core repo data data = client.repo(owner, name) if data is None: report.fetch_error = ( "Repository not found (404). Check the URL or repo visibility." ) return report report.stars = data.get("stargazers_count", 0) report.forks = data.get("forks_count", 0) report.open_issues = data.get("open_issues_count", 0) report.watchers = data.get("watchers_count", 0) report.size_kb = data.get("size", 0) report.language = data.get("language") report.license = (data.get("license") or {}).get("spdx_id") report.created_at = data.get("created_at") report.updated_at = data.get("updated_at") report.pushed_at = data.get("pushed_at") report.default_branch = data.get("default_branch", "main") report.has_wiki = data.get("has_wiki", False) report.has_pages = data.get("has_pages", False) report.archived = data.get("archived", False) report.topics = data.get("topics", []) # 2. Languages report.languages = client.languages(owner, name) # 3. Contributors report.contributor_count = client.contributors(owner, name) # 4. Recent commits (90 days) report.commit_count_recent = client.recent_commits(owner, name) # 5. Releases report.release_count = client.releases(owner, name) # 6. File tree analysis fc, deps, has_ci = client.tree_summary(owner, name, report.default_branch) report.file_count = fc report.dependency_files = deps report.has_ci = has_ci except RateLimitError as e: report.fetch_error = f"GitHub rate limit: {e}" return report except FetchError as e: report.fetch_error = f"Network error: {e}" return report except Exception as e: report.fetch_error = f"Unexpected error: {e}" return report # 7. Compute scores report.activity_score = compute_activity_score(report) report.complexity_score = compute_complexity_score(report) report.difficulty = classify_difficulty( report.activity_score, report.complexity_score ) return report