Spaces:
Sleeping
Sleeping
"""
GitHub Repository Intelligence Analyzer
Core analysis engine — scoring, complexity, and classification logic.
Author: GSoC 2026 Pre-Task Submission
"""
import datetime
import math
import os
import re
from dataclasses import dataclass, field
from typing import Optional

import requests
# ──────────────────────────────────────────────
# Data model
# ──────────────────────────────────────────────
@dataclass
class RepoReport:
    """Everything collected and computed for one analysed repository.

    ``url``, ``owner`` and ``name`` are required; every other field has a
    default so a report can be created before any network call and filled in
    step by step. ``fetch_error`` stays ``None`` unless fetching failed.
    """

    url: str
    owner: str
    name: str

    # Raw GitHub data (copied from the repository endpoint payload)
    stars: int = 0
    forks: int = 0
    open_issues: int = 0
    watchers: int = 0
    size_kb: int = 0
    language: Optional[str] = None
    languages: dict = field(default_factory=dict)
    topics: list = field(default_factory=list)
    license: Optional[str] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
    pushed_at: Optional[str] = None
    default_branch: str = "main"
    has_wiki: bool = False
    has_pages: bool = False
    archived: bool = False

    # Fetched separately (one extra API call each)
    contributor_count: int = 0
    commit_count_recent: int = 0  # commits in last 90 days
    release_count: int = 0
    has_ci: bool = False
    dependency_files: list = field(default_factory=list)
    file_count: int = 0

    # Computed scores
    activity_score: float = 0.0
    complexity_score: float = 0.0
    difficulty: str = "Unknown"

    # Human-readable failure description; None when the fetch succeeded.
    fetch_error: Optional[str] = None
# ──────────────────────────────────────────────
# GitHub API client
# ──────────────────────────────────────────────
# File names that mark a dependency manifest or lock file. Matching is done on
# the bare file name, and the order here is the order results are reported in.
DEPENDENCY_FILES = [
    # Python
    "requirements.txt",
    "Pipfile",
    "pyproject.toml",
    # JS/TS
    "package.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    # Rust/Go/Java
    "Cargo.toml",
    "go.mod",
    "pom.xml",
    "build.gradle",
    # Ruby/PHP/Elixir
    "Gemfile",
    "composer.json",
    "mix.exs",
]
# Path prefixes (directories or single files) whose presence in the repository
# tree is taken as evidence of a CI setup.
CI_PATHS = [
    ".github/workflows",    # directory
    ".travis.yml",
    ".circleci",            # directory
    "Jenkinsfile",
    ".gitlab-ci.yml",
    "azure-pipelines.yml",
]
class GitHubClient:
    """Small GitHub REST API client used by the analysis pipeline.

    All requests go through one ``requests.Session`` carrying the API version
    headers and, when a token is given, a bearer ``Authorization`` header.
    Transport failures surface as ``FetchError``; rate limiting detected by
    ``_get`` surfaces as ``RateLimitError``.
    """

    BASE = "https://api.github.com"

    # Captures the ``page`` value of the rel="last" entry in a Link header.
    # Anchoring on [?&] keeps ``per_page=`` from being mistaken for ``page=``,
    # and ``[^>]*`` lets ``page`` sit anywhere in the query string.
    _LAST_PAGE_RE = re.compile(r'[?&]page=(\d+)[^>]*>\s*;\s*rel="last"')

    def __init__(self, token: Optional[str] = None):
        """Create the session; ``token`` (optional) enables authenticated calls."""
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
            }
        )
        if token:
            self.session.headers["Authorization"] = f"Bearer {token}"

    def _get(self, path: str, params: Optional[dict] = None) -> Optional[dict | list]:
        """GET ``BASE + path`` and return the decoded JSON body.

        Returns ``None`` on 404.

        Raises:
            RateLimitError: on HTTP 403 or 429 (message carries the
                ``X-RateLimit-*`` header values).
            FetchError: on any other HTTP error, transport failure, or a body
                that is not valid JSON.
        """
        url = f"{self.BASE}{path}"
        try:
            resp = self.session.get(url, params=params, timeout=15)
        except requests.RequestException as e:
            raise FetchError(str(e)) from e

        if resp.status_code in (403, 429):
            remaining = resp.headers.get("X-RateLimit-Remaining", "?")
            reset = resp.headers.get("X-RateLimit-Reset", "?")
            raise RateLimitError(
                f"Rate limit hit. Remaining: {remaining}. Reset epoch: {reset}"
            )
        if resp.status_code == 404:
            return None
        try:
            resp.raise_for_status()
            return resp.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers JSON decode errors raised by older requests.
            raise FetchError(str(e)) from e

    def _count_by_last_page(
        self, path: str, params: dict, zero_statuses: tuple
    ) -> int:
        """Count items of a paginated listing with a single request.

        The listing is requested with ``per_page=1`` so the page number of the
        rel="last" link equals the number of items. Without such a link the
        response is a single page and its list length is the count.

        Returns 0 for any status in ``zero_statuses`` and for bodies that are
        not a JSON list (e.g. an error object).

        Raises:
            FetchError: when the request itself fails.
        """
        try:
            resp = self.session.get(
                f"{self.BASE}{path}",
                params={"per_page": 1, **params},
                timeout=15,
            )
        except requests.RequestException as e:
            raise FetchError(str(e)) from e

        if resp.status_code in zero_statuses:
            return 0

        match = self._LAST_PAGE_RE.search(resp.headers.get("Link", ""))
        if match:
            return int(match.group(1))

        try:
            body = resp.json()
        except ValueError:
            return 0
        # Only a list is a page of results; a dict is an error payload.
        return len(body) if isinstance(body, list) else 0

    def repo(self, owner: str, name: str) -> Optional[dict]:
        """Return the repository payload, or ``None`` if it does not exist."""
        return self._get(f"/repos/{owner}/{name}")

    def languages(self, owner: str, name: str) -> dict:
        """Return the languages mapping, or ``{}`` when unavailable."""
        data = self._get(f"/repos/{owner}/{name}/languages")
        return data if isinstance(data, dict) else {}

    def contributors(self, owner: str, name: str) -> int:
        """Count contributors — uses the pagination header trick for speed.

        Sends ``anon=true``. Returns 0 on 403, 404 or 204 (best effort).
        """
        return self._count_by_last_page(
            f"/repos/{owner}/{name}/contributors",
            {"anon": "true"},
            (403, 404, 204),
        )

    def recent_commits(self, owner: str, name: str, days: int = 90) -> int:
        """Count commits made in the last ``days`` days.

        Returns 0 on 403, 404 or 409 (409 = empty repo).
        """
        cutoff = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(
            days=days
        )
        # Whole-second UTC timestamp in the YYYY-MM-DDTHH:MM:SSZ form.
        since = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")
        return self._count_by_last_page(
            f"/repos/{owner}/{name}/commits",
            {"since": since},
            (403, 404, 409),
        )

    def releases(self, owner: str, name: str) -> int:
        """Count releases on the first page (so at most 100 are counted)."""
        data = self._get(f"/repos/{owner}/{name}/releases", params={"per_page": 100})
        return len(data) if isinstance(data, list) else 0

    def tree_summary(
        self, owner: str, name: str, branch: str
    ) -> tuple[int, list, bool]:
        """Return (file_count, dependency_files_found, has_ci).

        Reads the recursive git tree of ``branch``. Returns ``(0, [], False)``
        when the tree is missing or malformed.
        NOTE(review): very large trees may come back truncated by the API, in
        which case these numbers would be lower bounds — confirm.
        """
        data = self._get(
            f"/repos/{owner}/{name}/git/trees/{branch}",
            params={"recursive": "1"},
        )
        if not isinstance(data, dict) or "tree" not in data:
            return 0, [], False

        paths = [item["path"] for item in data["tree"] if item["type"] == "blob"]
        file_count = len(paths)

        # Dependency detection (check file names, not full paths)
        filenames = {p.split("/")[-1] for p in paths}
        dep_found = [f for f in DEPENDENCY_FILES if f in filenames]

        # CI detection: any file whose path starts with a known CI prefix
        ci_prefixes = tuple(CI_PATHS)
        has_ci = any(p.startswith(ci_prefixes) for p in paths)
        return file_count, dep_found, has_ci
class RateLimitError(Exception):
    """Raised when GitHub answers a request with a rate-limit response."""
class FetchError(Exception):
    """Raised when a GitHub request fails for a non-rate-limit reason."""
# ──────────────────────────────────────────────
# Scoring engine
# ──────────────────────────────────────────────
| def _days_since(iso_str: Optional[str]) -> float: | |
| """Days elapsed since an ISO 8601 timestamp. Returns large number if None.""" | |
| if not iso_str: | |
| return 9999 | |
| try: | |
| dt = datetime.datetime.fromisoformat(iso_str.replace("Z", "+00:00")) | |
| delta = datetime.datetime.now(datetime.timezone.utc) - dt | |
| return delta.total_seconds() / 86400 | |
| except Exception: | |
| return 9999 | |
def compute_activity_score(report: RepoReport) -> float:
    """
    Activity Score (0–100): measures how alive and maintained a repo is.

    Formula (weighted sum, each component 0–1, then ×100):

        Component                  Weight  Rationale
        ─────────────────────────────────────────────────────────
        recent_commits (90d)       0.30    Primary signal of active dev
        contributor_count          0.20    Community health
        stars (log-scaled)         0.15    Popularity / interest
        forks (log-scaled)         0.10    Adoption / downstream use
        open_issues                0.10    Engagement (capped)
        recency (days since push)  0.10    Freshness
        releases                   0.05    Delivery maturity

    Caps prevent one outlier metric from dominating.
    Returns the score rounded to two decimals.
    """

    def log_scale(value: float, cap: int) -> float:
        """Map value to [0,1] using log scale with a cap."""
        if value <= 0:
            return 0.0
        return min(math.log1p(value) / math.log1p(cap), 1.0)

    def recency_score(days: float) -> float:
        """1.0 if pushed today, decays linearly to 0.0 at 365 days."""
        # Clamp on both sides: a negative day count (push timestamp ahead of
        # the local clock) must not push this component above 1.0.
        return min(1.0, max(0.0, 1.0 - days / 365.0))

    # name -> (normalised value in [0, 1], weight); weights sum to 1.0
    components = {
        "recent_commits": (log_scale(report.commit_count_recent, 500), 0.30),
        "contributors": (log_scale(report.contributor_count, 200), 0.20),
        "stars": (log_scale(report.stars, 5000), 0.15),
        "forks": (log_scale(report.forks, 1000), 0.10),
        "open_issues": (log_scale(report.open_issues, 100), 0.10),
        "recency": (recency_score(_days_since(report.pushed_at)), 0.10),
        "releases": (log_scale(report.release_count, 50), 0.05),
    }
    score = sum(val * weight for val, weight in components.values())
    return round(score * 100, 2)
def compute_complexity_score(report: RepoReport) -> float:
    """
    Complexity Score (0–100): estimates technical depth of the repo.

        Component                  Weight  Rationale
        ─────────────────────────────────────────────────────────
        file_count (log-scaled)    0.30    Codebase size
        language_diversity         0.25    Multi-lang = more complexity
        dependency_count           0.20    External surface area
        repo_size_kb (log)         0.15    Raw size proxy
        has_ci                     0.10    Engineering maturity signal

    Each component is normalised to [0, 1]; the weighted sum is scaled to
    0–100 and rounded to two decimals.
    """

    def capped_log(amount: float, ceiling: int) -> float:
        """Log-scale ``amount`` against ``ceiling``; non-positive maps to 0."""
        if amount <= 0:
            return 0.0
        ratio = math.log1p(amount) / math.log1p(ceiling)
        return min(ratio, 1.0)

    language_total = len(report.languages)
    manifest_total = len(report.dependency_files)
    ci_signal = 1.0 if report.has_ci else 0.0

    # (normalised value, weight) pairs, in the order of the table above
    weighted_parts = (
        (capped_log(report.file_count, 2000), 0.30),
        (capped_log(language_total, 10), 0.25),
        (capped_log(manifest_total, 8), 0.20),
        (capped_log(report.size_kb, 100_000), 0.15),
        (ci_signal, 0.10),
    )
    total = sum(value * weight for value, weight in weighted_parts)
    return round(total * 100, 2)
def classify_difficulty(activity: float, complexity: float) -> str:
    """
    Classify learning difficulty from the two 0–100 scores.

    Rules, checked in this order:
        complexity >= 70 and activity >= 75            → "Advanced"
        mean of both >= 50, or complexity >= 55,
        or activity >= 60                              → "Intermediate"
        anything else                                  → "Beginner"
    """
    mean_score = (activity + complexity) / 2

    # Advanced requires both dimensions to be high at once.
    if activity >= 75 and complexity >= 70:
        return "Advanced"

    # Intermediate: moderate-to-high on either dimension, or on average.
    reaches_intermediate = mean_score >= 50 or complexity >= 55 or activity >= 60
    return "Intermediate" if reaches_intermediate else "Beginner"
# ──────────────────────────────────────────────
# Main analysis pipeline
# ──────────────────────────────────────────────
def parse_github_url(url: str) -> tuple[str, str]:
    """Extract (owner, repo) from various GitHub URL formats.

    Accepted forms include ``https://github.com/owner/repo``, the same with
    ``http://``, ``www.``, a trailing slash, a ``.git`` suffix, extra path
    segments, a query string or a fragment; the SSH form
    ``git@github.com:owner/repo.git``; and the bare ``owner/repo`` shorthand.

    Raises:
        ValueError: if owner and repo cannot both be found, or the URL names
            a host other than github.com.
    """
    original = url.strip()

    # Query string and fragment never carry the owner or repo name.
    text = original.split("?", 1)[0].split("#", 1)[0]

    # Drop the scheme (https://, http://, ssh://, git://) when present.
    if "://" in text:
        text = text.split("://", 1)[1]

    # scp-like SSH syntax separates host and path with ":" instead of "/".
    if text.startswith("git@"):
        text = text[len("git@"):].replace(":", "/", 1)

    parts = [segment for segment in text.split("/") if segment]

    # GitHub account names cannot contain dots, so a dotted first segment is
    # a host name rather than an owner.
    if parts and "." in parts[0]:
        host = parts.pop(0).lower()
        if host not in ("github.com", "www.github.com"):
            raise ValueError(f"Cannot parse GitHub URL: {original!r}")

    if len(parts) < 2:
        raise ValueError(f"Cannot parse GitHub URL: {original!r}")

    owner, name = parts[0], parts[1]
    if name.endswith(".git"):
        name = name[: -len(".git")]
    if not name:
        raise ValueError(f"Cannot parse GitHub URL: {original!r}")
    return owner, name
def analyze_repo(url: str, client: GitHubClient) -> RepoReport:
    """Full pipeline: fetch → score → classify → return report.

    Problems while parsing the URL or fetching data do not propagate; they
    are described in ``report.fetch_error`` and the report is returned
    without scores. Scores are computed only after every fetch step succeeded.
    """
    try:
        owner, name = parse_github_url(url)
    except ValueError as bad_url:
        return RepoReport(url=url, owner="?", name="?", fetch_error=str(bad_url))

    report = RepoReport(url=url, owner=owner, name=name)

    # (report attribute, extractor) pairs, applied in order to the payload
    # returned by the repository endpoint.
    metadata_fields = (
        ("stars", lambda d: d.get("stargazers_count", 0)),
        ("forks", lambda d: d.get("forks_count", 0)),
        ("open_issues", lambda d: d.get("open_issues_count", 0)),
        ("watchers", lambda d: d.get("watchers_count", 0)),
        ("size_kb", lambda d: d.get("size", 0)),
        ("language", lambda d: d.get("language")),
        # "license" may be null in the payload, hence the `or {}`
        ("license", lambda d: (d.get("license") or {}).get("spdx_id")),
        ("created_at", lambda d: d.get("created_at")),
        ("updated_at", lambda d: d.get("updated_at")),
        ("pushed_at", lambda d: d.get("pushed_at")),
        ("default_branch", lambda d: d.get("default_branch", "main")),
        ("has_wiki", lambda d: d.get("has_wiki", False)),
        ("has_pages", lambda d: d.get("has_pages", False)),
        ("archived", lambda d: d.get("archived", False)),
        ("topics", lambda d: d.get("topics", [])),
    )

    try:
        # 1. Core repo data
        payload = client.repo(owner, name)
        if payload is None:
            report.fetch_error = (
                "Repository not found (404). Check the URL or repo visibility."
            )
            return report
        for attribute, extract in metadata_fields:
            setattr(report, attribute, extract(payload))

        # 2–5. Languages, contributors, recent commits (90 days), releases
        report.languages = client.languages(owner, name)
        report.contributor_count = client.contributors(owner, name)
        report.commit_count_recent = client.recent_commits(owner, name)
        report.release_count = client.releases(owner, name)

        # 6. File tree analysis
        tree_info = client.tree_summary(owner, name, report.default_branch)
        file_total, manifests, ci_present = tree_info
        report.file_count = file_total
        report.dependency_files = manifests
        report.has_ci = ci_present
    except Exception as exc:
        # Pick the message prefix from the kind of failure.
        if isinstance(exc, RateLimitError):
            prefix = "GitHub rate limit"
        elif isinstance(exc, FetchError):
            prefix = "Network error"
        else:
            prefix = "Unexpected error"
        report.fetch_error = f"{prefix}: {exc}"
        return report

    # 7. Compute scores
    activity = compute_activity_score(report)
    report.activity_score = activity
    complexity = compute_complexity_score(report)
    report.complexity_score = complexity
    report.difficulty = classify_difficulty(activity, complexity)
    return report