github-repo-analyze / src /analyzer.py
sifat371's picture
added app.py
9c6d7dc
"""
GitHub Repository Intelligence Analyzer
Core analysis engine — scoring, complexity, and classification logic.
Author: GSoC 2026 Pre-Task Submission
"""
import os
import math
import datetime
import requests
from dataclasses import dataclass, field
from typing import Optional
# ──────────────────────────────────────────────
# Data model
# ──────────────────────────────────────────────
@dataclass
class RepoReport:
    """Everything gathered and computed for a single analysed repository.

    The three identity fields are required; every other field has a neutral
    default so that a partially filled report (for example after a fetch
    failure, signalled through ``fetch_error``) is still a valid object.
    """

    # --- Identity (required) ---
    url: str
    owner: str
    name: str

    # --- Raw GitHub data ---
    stars: int = 0
    forks: int = 0
    open_issues: int = 0
    watchers: int = 0
    size_kb: int = 0
    language: Optional[str] = None
    languages: dict = field(default_factory=dict)
    topics: list = field(default_factory=list)
    license: Optional[str] = None
    created_at: Optional[str] = None
    updated_at: Optional[str] = None
    pushed_at: Optional[str] = None
    default_branch: str = "main"
    has_wiki: bool = False
    has_pages: bool = False
    archived: bool = False

    # --- Fetched through separate API calls ---
    contributor_count: int = 0
    commit_count_recent: int = 0  # commits in last 90 days
    release_count: int = 0
    has_ci: bool = False
    dependency_files: list = field(default_factory=list)
    file_count: int = 0

    # --- Computed scores ---
    activity_score: float = 0.0
    complexity_score: float = 0.0
    difficulty: str = "Unknown"

    # Human-readable failure description; None when nothing went wrong.
    fetch_error: Optional[str] = None
# ──────────────────────────────────────────────
# GitHub API client
# ──────────────────────────────────────────────
# Manifest / lock file names whose presence marks a dependency ecosystem.
# Matching is done on the bare file name, wherever it sits in the tree.
DEPENDENCY_FILES = [
    # Python
    "requirements.txt",
    "Pipfile",
    "pyproject.toml",
    # JS/TS
    "package.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    # Rust/Go/Java
    "Cargo.toml",
    "go.mod",
    "pom.xml",
    "build.gradle",
    # Ruby/PHP/Elixir
    "Gemfile",
    "composer.json",
    "mix.exs",
]

# Path prefixes (relative to the repository root) that indicate a CI setup.
CI_PATHS = [
    ".github/workflows",
    ".travis.yml",
    ".circleci",
    "Jenkinsfile",
    ".gitlab-ci.yml",
    "azure-pipelines.yml",
]
class GitHubClient:
BASE = "https://api.github.com"
def __init__(self, token: Optional[str] = None):
self.session = requests.Session()
self.session.headers.update(
{
"Accept": "application/vnd.github+json",
"X-GitHub-Api-Version": "2022-11-28",
}
)
if token:
self.session.headers["Authorization"] = f"Bearer {token}"
def _get(self, path: str, params: dict = None) -> Optional[dict | list]:
"""Safe GET with rate-limit awareness."""
url = f"{self.BASE}{path}"
try:
resp = self.session.get(url, params=params, timeout=15)
if resp.status_code == 403:
remaining = resp.headers.get("X-RateLimit-Remaining", "?")
reset = resp.headers.get("X-RateLimit-Reset", "?")
raise RateLimitError(
f"Rate limit hit. Remaining: {remaining}. Reset epoch: {reset}"
)
if resp.status_code == 404:
return None
resp.raise_for_status()
return resp.json()
except RateLimitError:
raise
except requests.RequestException as e:
raise FetchError(str(e)) from e
def repo(self, owner: str, name: str) -> Optional[dict]:
return self._get(f"/repos/{owner}/{name}")
def languages(self, owner: str, name: str) -> dict:
data = self._get(f"/repos/{owner}/{name}/languages")
return data if isinstance(data, dict) else {}
def contributors(self, owner: str, name: str) -> int:
"""Count contributors β€” uses pagination header trick for speed."""
resp = self.session.get(
f"{self.BASE}/repos/{owner}/{name}/contributors",
params={"per_page": 1, "anon": "true"},
timeout=15,
)
if resp.status_code in (403, 404, 204):
return 0
# GitHub returns last-page number in Link header
link = resp.headers.get("Link", "")
if 'rel="last"' in link:
try:
last_part = [p for p in link.split(",") if 'rel="last"' in p][0]
page_num = int(last_part.split("page=")[-1].split(">")[0])
return page_num
except (IndexError, ValueError):
pass
try:
return len(resp.json())
except Exception:
return 0
def recent_commits(self, owner: str, name: str, days: int = 90) -> int:
since = (
datetime.datetime.utcnow() - datetime.timedelta(days=days)
).isoformat() + "Z"
# Use commits endpoint with since filter; count via pagination
resp = self.session.get(
f"{self.BASE}/repos/{owner}/{name}/commits",
params={"per_page": 1, "since": since},
timeout=15,
)
if resp.status_code in (403, 404, 409): # 409 = empty repo
return 0
link = resp.headers.get("Link", "")
if 'rel="last"' in link:
try:
last_part = [p for p in link.split(",") if 'rel="last"' in p][0]
return int(last_part.split("page=")[-1].split(">")[0])
except (IndexError, ValueError):
pass
try:
return len(resp.json())
except Exception:
return 0
def releases(self, owner: str, name: str) -> int:
data = self._get(f"/repos/{owner}/{name}/releases", params={"per_page": 100})
return len(data) if isinstance(data, list) else 0
def tree_summary(
self, owner: str, name: str, branch: str
) -> tuple[int, list, bool]:
"""Return (file_count, dependency_files_found, has_ci)."""
data = self._get(
f"/repos/{owner}/{name}/git/trees/{branch}",
params={"recursive": "1"},
)
if not data or "tree" not in data:
return 0, [], False
paths = [item["path"] for item in data["tree"] if item["type"] == "blob"]
file_count = len(paths)
# Dependency detection (check file names, not full paths)
filenames = {p.split("/")[-1] for p in paths}
dep_found = [f for f in DEPENDENCY_FILES if f in filenames]
# CI detection
has_ci = any(any(p.startswith(ci) for p in paths) for ci in CI_PATHS)
return file_count, dep_found, has_ci
class RateLimitError(Exception):
    """Raised when the GitHub API reports that the rate limit was hit."""
class FetchError(Exception):
    """Raised when an HTTP request to the GitHub API fails."""
# ──────────────────────────────────────────────
# Scoring engine
# ──────────────────────────────────────────────
def _days_since(iso_str: Optional[str]) -> float:
"""Days elapsed since an ISO 8601 timestamp. Returns large number if None."""
if not iso_str:
return 9999
try:
dt = datetime.datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
delta = datetime.datetime.now(datetime.timezone.utc) - dt
return delta.total_seconds() / 86400
except Exception:
return 9999
def compute_activity_score(report: RepoReport) -> float:
    """
    Activity Score (0-100): how alive and maintained a repository looks.

    Each signal is normalised to [0, 1], multiplied by its weight, summed,
    and the total is scaled by 100 and rounded to two decimals.

        Signal                      Weight   Normalisation
        ---------------------------------------------------------
        recent commits (90 days)    0.30     log scale, cap 500
        contributor count           0.20     log scale, cap 200
        stars                       0.15     log scale, cap 5000
        forks                       0.10     log scale, cap 1000
        open issues                 0.10     log scale, cap 100
        recency of last push        0.10     linear decay over 365 days
        releases                    0.05     log scale, cap 50

    The caps keep a single outlier metric from dominating the score.
    """

    def capped_log(value: float, cap: int) -> float:
        """Log-scale ``value`` into [0, 1]; values at or above ``cap`` give 1."""
        if value <= 0:
            return 0.0
        return min(math.log1p(value) / math.log1p(cap), 1.0)

    # Linear freshness: 1.0 for a push right now, 0.0 from 365 days onwards.
    idle_days = _days_since(report.pushed_at)
    freshness = max(0.0, 1.0 - idle_days / 365.0)

    # (normalised signal, weight), in the order they are summed
    weighted_signals = [
        (capped_log(report.commit_count_recent, 500), 0.30),
        (capped_log(report.contributor_count, 200), 0.20),
        (capped_log(report.stars, 5000), 0.15),
        (capped_log(report.forks, 1000), 0.10),
        (capped_log(report.open_issues, 100), 0.10),
        (freshness, 0.10),
        (capped_log(report.release_count, 50), 0.05),
    ]

    total = sum(signal * weight for signal, weight in weighted_signals)
    return round(total * 100, 2)
def compute_complexity_score(report: RepoReport) -> float:
    """
    Complexity Score (0-100): a rough estimate of the repo's technical depth.

    Each signal is normalised to [0, 1], multiplied by its weight, summed,
    and the total is scaled by 100 and rounded to two decimals.

        Signal                  Weight   Normalisation
        ------------------------------------------------------
        file count              0.30     log scale, cap 2000
        number of languages     0.25     log scale, cap 10
        dependency manifests    0.20     log scale, cap 8
        repository size (KB)    0.15     log scale, cap 100000
        CI configuration        0.10     1.0 if present, else 0.0
    """

    def capped_log(value: float, cap: int) -> float:
        """Log-scale ``value`` into [0, 1]; values at or above ``cap`` give 1."""
        if value <= 0:
            return 0.0
        return min(math.log1p(value) / math.log1p(cap), 1.0)

    ci_signal = 1.0 if report.has_ci else 0.0

    # (normalised signal, weight), in the order they are summed
    weighted_signals = [
        (capped_log(report.file_count, 2000), 0.30),
        (capped_log(len(report.languages), 10), 0.25),
        (capped_log(len(report.dependency_files), 8), 0.20),
        (capped_log(report.size_kb, 100_000), 0.15),
        (ci_signal, 0.10),
    ]

    total = sum(signal * weight for signal, weight in weighted_signals)
    return round(total * 100, 2)
def classify_difficulty(activity: float, complexity: float) -> str:
    """
    Map the two scores onto a learning-difficulty label.

    Rules, checked in this order:
        "Advanced"      complexity >= 70 and activity >= 75
        "Intermediate"  mean of both >= 50, or complexity >= 55,
                        or activity >= 60
        "Beginner"      everything else
    """
    # Both dimensions must be high for the top tier.
    if activity >= 75 and complexity >= 70:
        return "Advanced"

    # One strong dimension, or a solid average, is enough for the middle tier.
    mean_score = (activity + complexity) / 2
    is_intermediate = mean_score >= 50 or complexity >= 55 or activity >= 60
    return "Intermediate" if is_intermediate else "Beginner"
# ──────────────────────────────────────────────
# Main analysis pipeline
# ──────────────────────────────────────────────
def parse_github_url(url: str) -> tuple[str, str]:
    """Extract ``(owner, repo)`` from the common ways of naming a GitHub repo.

    Accepted forms (surrounding whitespace and trailing slashes are ignored):
        https://github.com/owner/repo            (also http://)
        https://www.github.com/owner/repo
        https://github.com/owner/repo.git
        https://github.com/owner/repo/tree/main  (extra path segments ignored)
        https://github.com/owner/repo?tab=readme#top
        git@github.com:owner/repo.git
        ssh://git@github.com/owner/repo.git
        github.com/owner/repo
        owner/repo

    Raises:
        ValueError: if an owner and a repository name cannot both be found.
    """
    url = url.strip()
    location = url

    # A query string or fragment is never part of the owner/repo path.
    for separator in ("#", "?"):
        location = location.partition(separator)[0]

    lowered = location.lower()
    if lowered.startswith("git@"):
        # scp-like syntax: everything after the first ':' is the path.
        location = location.partition(":")[2]
    else:
        for scheme in ("https://", "http://", "ssh://git@", "git://"):
            if lowered.startswith(scheme):
                location = location[len(scheme):]
                break

    # Empty segments come from doubled or trailing slashes.
    segments = [s for s in location.split("/") if s]
    if segments and segments[0].lower() in ("github.com", "www.github.com"):
        segments = segments[1:]

    if len(segments) < 2:
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")

    owner, repo = segments[0], segments[1]
    # Clone URLs carry a ".git" suffix on the repository segment.
    if repo.endswith(".git"):
        repo = repo[:-4]
    if not repo:
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
    return owner, repo
def analyze_repo(url: str, client: GitHubClient) -> RepoReport:
    """Run the whole pipeline for one repository: fetch, score, classify.

    This function does not raise for fetch problems. Any failure is recorded
    in ``fetch_error`` on the returned report, and in that case the scores
    keep their defaults.

    Args:
        url: Repository URL (or ``owner/repo``) as given by the user.
        client: API client used for all requests.

    Returns:
        The populated report, or a partially filled one with ``fetch_error``
        set when the URL cannot be parsed or fetching fails.
    """
    try:
        owner, name = parse_github_url(url)
    except ValueError as parse_err:
        return RepoReport(url=url, owner="?", name="?", fetch_error=str(parse_err))

    report = RepoReport(url=url, owner=owner, name=name)
    failure = None

    try:
        # Step 1: core repository payload
        payload = client.repo(owner, name)
        if payload is None:
            report.fetch_error = (
                "Repository not found (404). Check the URL or repo visibility."
            )
            return report

        read = payload.get
        report.stars = read("stargazers_count", 0)
        report.forks = read("forks_count", 0)
        report.open_issues = read("open_issues_count", 0)
        report.watchers = read("watchers_count", 0)
        report.size_kb = read("size", 0)
        report.language = read("language")
        # "license" may be null in the payload, hence the `or {}` fallback.
        report.license = (read("license") or {}).get("spdx_id")
        report.created_at = read("created_at")
        report.updated_at = read("updated_at")
        report.pushed_at = read("pushed_at")
        report.default_branch = read("default_branch", "main")
        report.has_wiki = read("has_wiki", False)
        report.has_pages = read("has_pages", False)
        report.archived = read("archived", False)
        report.topics = read("topics", [])

        # Steps 2-5: data that lives behind separate endpoints
        report.languages = client.languages(owner, name)
        report.contributor_count = client.contributors(owner, name)
        report.commit_count_recent = client.recent_commits(owner, name)
        report.release_count = client.releases(owner, name)

        # Step 6: file tree analysis on the default branch
        tree_info = client.tree_summary(owner, name, report.default_branch)
        report.file_count, report.dependency_files, report.has_ci = tree_info
    except RateLimitError as exc:
        failure = f"GitHub rate limit: {exc}"
    except FetchError as exc:
        failure = f"Network error: {exc}"
    except Exception as exc:
        failure = f"Unexpected error: {exc}"

    if failure is not None:
        report.fetch_error = failure
        return report

    # Step 7: scores and classification
    report.activity_score = compute_activity_score(report)
    report.complexity_score = compute_complexity_score(report)
    report.difficulty = classify_difficulty(
        report.activity_score, report.complexity_score
    )
    return report