Spaces:

sifat371
/

github-repo-analyze

Sleeping

App Files Files Community

github-repo-analyze / src /analyzer.py

sifat371

added app.py

9c6d7dc about 2 months ago

raw

history blame contribute delete

15.1 kB

	"""
	GitHub Repository Intelligence Analyzer
	Core analysis engine — scoring, complexity, and classification logic.

	Author: GSoC 2026 Pre-Task Submission
	"""

	import os
	import math
	import datetime
	import requests
	from dataclasses import dataclass, field
	from typing import Optional


	# ──────────────────────────────────────────────
	# Data model
	# ──────────────────────────────────────────────


	@dataclass
	class RepoReport:
	    """Flat record holding everything collected and computed for one repository.

	    Only ``url``, ``owner`` and ``name`` are required. Every other field has a
	    neutral default so a report can be returned early (with ``fetch_error``
	    set) before all data has been gathered.
	    """

	    # --- Identity (required) ---
	    url: str
	    owner: str
	    name: str

	    # --- Values copied from the core repository payload ---
	    stars: int = 0
	    forks: int = 0
	    open_issues: int = 0
	    watchers: int = 0
	    size_kb: int = 0
	    language: Optional[str] = None
	    languages: dict = field(default_factory=dict)
	    topics: list = field(default_factory=list)
	    license: Optional[str] = None
	    created_at: Optional[str] = None
	    updated_at: Optional[str] = None
	    pushed_at: Optional[str] = None
	    default_branch: str = "main"
	    has_wiki: bool = False
	    has_pages: bool = False
	    archived: bool = False

	    # --- Values that need their own API requests ---
	    contributor_count: int = 0
	    commit_count_recent: int = 0  # commits in last 90 days
	    release_count: int = 0
	    has_ci: bool = False
	    dependency_files: list = field(default_factory=list)
	    file_count: int = 0

	    # --- Derived results ---
	    activity_score: float = 0.0
	    complexity_score: float = 0.0
	    difficulty: str = "Unknown"
	    fetch_error: Optional[str] = None  # stays None unless fetching failed


	# ──────────────────────────────────────────────
	# GitHub API client
	# ──────────────────────────────────────────────

	# Manifest / lock-file names that indicate declared dependencies.
	# Matched against bare file names, grouped here by ecosystem.
	DEPENDENCY_FILES = [
	    # Python
	    "requirements.txt", "Pipfile", "pyproject.toml",
	    # JS/TS
	    "package.json", "yarn.lock", "pnpm-lock.yaml",
	    # Rust/Go/Java
	    "Cargo.toml", "go.mod", "pom.xml", "build.gradle",
	    # Ruby/PHP/Elixir
	    "Gemfile", "composer.json", "mix.exs",
	]

	# Repository-root path prefixes whose presence signals a CI setup.
	CI_PATHS = [
	    ".github/workflows",
	    ".travis.yml",
	    ".circleci",
	    "Jenkinsfile",
	    ".gitlab-ci.yml",
	    "azure-pipelines.yml",
	]


	class GitHubClient:
	BASE = "https://api.github.com"

	def __init__(self, token: Optional[str] = None):
	self.session = requests.Session()
	self.session.headers.update(
	{
	"Accept": "application/vnd.github+json",
	"X-GitHub-Api-Version": "2022-11-28",
	}
	)
	if token:
	self.session.headers["Authorization"] = f"Bearer {token}"

	def _get(self, path: str, params: dict = None) -> Optional[dict \| list]:
	"""Safe GET with rate-limit awareness."""
	url = f"{self.BASE}{path}"
	try:
	resp = self.session.get(url, params=params, timeout=15)
	if resp.status_code == 403:
	remaining = resp.headers.get("X-RateLimit-Remaining", "?")
	reset = resp.headers.get("X-RateLimit-Reset", "?")
	raise RateLimitError(
	f"Rate limit hit. Remaining: {remaining}. Reset epoch: {reset}"
	)
	if resp.status_code == 404:
	return None
	resp.raise_for_status()
	return resp.json()
	except RateLimitError:
	raise
	except requests.RequestException as e:
	raise FetchError(str(e)) from e

	def repo(self, owner: str, name: str) -> Optional[dict]:
	return self._get(f"/repos/{owner}/{name}")

	def languages(self, owner: str, name: str) -> dict:
	data = self._get(f"/repos/{owner}/{name}/languages")
	return data if isinstance(data, dict) else {}

	def contributors(self, owner: str, name: str) -> int:
	"""Count contributors — uses pagination header trick for speed."""
	resp = self.session.get(
	f"{self.BASE}/repos/{owner}/{name}/contributors",
	params={"per_page": 1, "anon": "true"},
	timeout=15,
	)
	if resp.status_code in (403, 404, 204):
	return 0
	# GitHub returns last-page number in Link header
	link = resp.headers.get("Link", "")
	if 'rel="last"' in link:
	try:
	last_part = [p for p in link.split(",") if 'rel="last"' in p][0]
	page_num = int(last_part.split("page=")[-1].split(">")[0])
	return page_num
	except (IndexError, ValueError):
	pass
	try:
	return len(resp.json())
	except Exception:
	return 0

	def recent_commits(self, owner: str, name: str, days: int = 90) -> int:
	since = (
	datetime.datetime.utcnow() - datetime.timedelta(days=days)
	).isoformat() + "Z"
	# Use commits endpoint with since filter; count via pagination
	resp = self.session.get(
	f"{self.BASE}/repos/{owner}/{name}/commits",
	params={"per_page": 1, "since": since},
	timeout=15,
	)
	if resp.status_code in (403, 404, 409): # 409 = empty repo
	return 0
	link = resp.headers.get("Link", "")
	if 'rel="last"' in link:
	try:
	last_part = [p for p in link.split(",") if 'rel="last"' in p][0]
	return int(last_part.split("page=")[-1].split(">")[0])
	except (IndexError, ValueError):
	pass
	try:
	return len(resp.json())
	except Exception:
	return 0

	def releases(self, owner: str, name: str) -> int:
	data = self._get(f"/repos/{owner}/{name}/releases", params={"per_page": 100})
	return len(data) if isinstance(data, list) else 0

	def tree_summary(
	self, owner: str, name: str, branch: str
	) -> tuple[int, list, bool]:
	"""Return (file_count, dependency_files_found, has_ci)."""
	data = self._get(
	f"/repos/{owner}/{name}/git/trees/{branch}",
	params={"recursive": "1"},
	)
	if not data or "tree" not in data:
	return 0, [], False

	paths = [item["path"] for item in data["tree"] if item["type"] == "blob"]
	file_count = len(paths)

	# Dependency detection (check file names, not full paths)
	filenames = {p.split("/")[-1] for p in paths}
	dep_found = [f for f in DEPENDENCY_FILES if f in filenames]

	# CI detection
	has_ci = any(any(p.startswith(ci) for p in paths) for ci in CI_PATHS)

	return file_count, dep_found, has_ci


	class RateLimitError(Exception):
	    """Signals that the GitHub API refused a request because of rate limiting."""


	class FetchError(Exception):
	    """Signals that an HTTP request to the GitHub API failed."""


	# ──────────────────────────────────────────────
	# Scoring engine
	# ──────────────────────────────────────────────


	def _days_since(iso_str: Optional[str]) -> float:
	"""Days elapsed since an ISO 8601 timestamp. Returns large number if None."""
	if not iso_str:
	return 9999
	try:
	dt = datetime.datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
	delta = datetime.datetime.now(datetime.timezone.utc) - dt
	return delta.total_seconds() / 86400
	except Exception:
	return 9999


	def compute_activity_score(report: RepoReport) -> float:
	    """
	    Activity Score (0–100): measures how alive and maintained a repo is.

	    Formula (weighted sum, each component 0–1, then ×100):

	        Component                   Weight   Rationale
	        ─────────────────────────────────────────────────────────
	        recent_commits (90d)        0.30     Primary signal of active dev
	        contributor_count           0.20     Community health
	        stars (log-scaled)          0.15     Popularity / interest
	        forks (log-scaled)          0.10     Adoption / downstream use
	        open_issues                 0.10     Engagement (capped)
	        recency (days since push)   0.10     Freshness
	        releases                    0.05     Delivery maturity

	    Caps prevent one outlier metric from dominating.

	    Args:
	        report: Report whose raw counters and ``pushed_at`` are already filled.

	    Returns:
	        The score rounded to two decimals.
	    """

	    def log_scale(value: float, cap: int) -> float:
	        """Map value to [0,1] using log scale with a cap."""
	        if value <= 0:
	            return 0.0
	        return min(math.log1p(value) / math.log1p(cap), 1.0)

	    def recency_score(days: float) -> float:
	        """1.0 if pushed today, decays linearly to 0.0 at 365 days."""
	        # Clamp both ends: a push timestamp slightly in the future gives a
	        # negative age, which must not lift this component above 1.0.
	        return min(1.0, max(0.0, 1.0 - days / 365.0))

	    # (normalised value, weight) per component; the weights sum to 1.0.
	    components = {
	        "recent_commits": (log_scale(report.commit_count_recent, 500), 0.30),
	        "contributors": (log_scale(report.contributor_count, 200), 0.20),
	        "stars": (log_scale(report.stars, 5000), 0.15),
	        "forks": (log_scale(report.forks, 1000), 0.10),
	        "open_issues": (log_scale(report.open_issues, 100), 0.10),
	        "recency": (recency_score(_days_since(report.pushed_at)), 0.10),
	        "releases": (log_scale(report.release_count, 50), 0.05),
	    }

	    score = sum(val * weight for val, weight in components.values())
	    return round(score * 100, 2)


	def compute_complexity_score(report: RepoReport) -> float:
	    """
	    Complexity Score (0–100): estimates technical depth of the repo.

	    Weighted sum of five normalised signals (each 0–1), multiplied by 100:

	        Signal                      Weight   Saturates at
	        ─────────────────────────────────────────────────────────
	        file_count (log-scaled)     0.30     2000 files
	        language count (log)        0.25     10 languages
	        dependency files (log)      0.20     8 files
	        repo size in KB (log)       0.15     100 000 KB
	        has_ci                      0.10     (1.0 when true, else 0.0)

	    Returns the score rounded to two decimals.
	    """

	    def _saturating_log(amount: float, ceiling: int) -> float:
	        # Non-positive amounts score 0; everything else is log-scaled
	        # against the ceiling and capped at 1.
	        return (
	            0.0
	            if amount <= 0
	            else min(math.log1p(amount) / math.log1p(ceiling), 1.0)
	        )

	    language_signal = _saturating_log(len(report.languages), 10)
	    dependency_signal = _saturating_log(len(report.dependency_files), 8)

	    # (normalised value, weight) pairs, summed in this order.
	    weighted_signals = [
	        (_saturating_log(report.file_count, 2000), 0.30),
	        (language_signal, 0.25),
	        (dependency_signal, 0.20),
	        (_saturating_log(report.size_kb, 100_000), 0.15),
	        (1.0 if report.has_ci else 0.0, 0.10),
	    ]

	    total = sum(value * weight for value, weight in weighted_signals)
	    return round(total * 100, 2)


	def classify_difficulty(activity: float, complexity: float) -> str:
	    """
	    Classify learning difficulty from the two scores.

	    Rules, checked in this order:

	        "Advanced"      complexity >= 70 AND activity >= 75
	        "Intermediate"  mean of both >= 50, OR complexity >= 55,
	                        OR activity >= 60
	        "Beginner"      everything else

	    Args:
	        activity: Activity score.
	        complexity: Complexity score.

	    Returns:
	        One of "Advanced", "Intermediate" or "Beginner".
	    """
	    mean_score = (activity + complexity) / 2

	    # Top tier needs both dimensions to be high at once.
	    if complexity >= 70 and activity >= 75:
	        return "Advanced"

	    # Middle tier: a decent mean, or one dimension that stands out.
	    if mean_score >= 50 or complexity >= 55 or activity >= 60:
	        return "Intermediate"

	    return "Beginner"


	# ──────────────────────────────────────────────
	# Main analysis pipeline
	# ──────────────────────────────────────────────


	def parse_github_url(url: str) -> tuple[str, str]:
	    """Extract (owner, repo) from various GitHub URL formats.

	    Accepted forms include::

	        https://github.com/owner/repo
	        http://www.github.com/owner/repo.git
	        github.com/owner/repo/tree/main
	        git@github.com:owner/repo.git
	        owner/repo

	    Surrounding whitespace, trailing slashes, a query string, a fragment and
	    a ``.git`` suffix on the repository name are ignored. The host name is
	    compared case-insensitively.

	    Raises:
	        ValueError: if fewer than two path segments remain, or if the URL
	            explicitly names a host other than github.com.
	    """
	    cleaned = url.strip()
	    # Query strings and fragments never carry the owner/repo pair.
	    for marker in ("#", "?"):
	        cleaned = cleaned.split(marker, 1)[0]
	    cleaned = cleaned.rstrip("/")

	    _, scheme_sep, remainder = cleaned.partition("://")
	    names_host = bool(scheme_sep)
	    if not scheme_sep:
	        remainder = cleaned
	    if remainder.startswith("git@"):
	        # scp-style SSH remote: the first ":" separates host from path.
	        remainder = remainder[len("git@"):].replace(":", "/", 1)
	        names_host = True

	    segments = [seg for seg in remainder.split("/") if seg]
	    if segments and segments[0].lower() in ("github.com", "www.github.com"):
	        segments = segments[1:]
	    elif names_host:
	        # A scheme or git@ prefix means the first segment is a host name,
	        # and it is not GitHub.
	        raise ValueError(f"Cannot parse GitHub URL: {url!r}")

	    if len(segments) < 2:
	        raise ValueError(f"Cannot parse GitHub URL: {url!r}")

	    owner, repo = segments[0], segments[1]
	    if repo.endswith(".git"):
	        repo = repo[:-4]
	    if not repo:
	        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
	    return owner, repo


	def analyze_repo(url: str, client: GitHubClient) -> RepoReport:
	    """Run the whole analysis for one repository URL.

	    Steps: parse the URL, fetch data through ``client``, compute both scores
	    and classify the difficulty. This function does not raise on failure:
	    problems are recorded in ``fetch_error`` and the report collected so far
	    is returned without scores.
	    """
	    try:
	        owner, name = parse_github_url(url)
	    except ValueError as exc:
	        return RepoReport(url=url, owner="?", name="?", fetch_error=str(exc))

	    report = RepoReport(url=url, owner=owner, name=name)

	    try:
	        # Core repository payload; None means the API answered 404.
	        payload = client.repo(owner, name)
	        if payload is None:
	            report.fetch_error = (
	                "Repository not found (404). Check the URL or repo visibility."
	            )
	            return report

	        pick = payload.get
	        report.stars = pick("stargazers_count", 0)
	        report.forks = pick("forks_count", 0)
	        report.open_issues = pick("open_issues_count", 0)
	        report.watchers = pick("watchers_count", 0)
	        report.size_kb = pick("size", 0)
	        report.language = pick("language")
	        # "license" may be null in the payload, hence the `or {}` guard.
	        report.license = (pick("license") or {}).get("spdx_id")
	        report.created_at = pick("created_at")
	        report.updated_at = pick("updated_at")
	        report.pushed_at = pick("pushed_at")
	        report.default_branch = pick("default_branch", "main")
	        report.has_wiki = pick("has_wiki", False)
	        report.has_pages = pick("has_pages", False)
	        report.archived = pick("archived", False)
	        report.topics = pick("topics", [])

	        # Figures that each need a request of their own.
	        report.languages = client.languages(owner, name)
	        report.contributor_count = client.contributors(owner, name)
	        report.commit_count_recent = client.recent_commits(owner, name)
	        report.release_count = client.releases(owner, name)

	        # File tree of the default branch.
	        (
	            report.file_count,
	            report.dependency_files,
	            report.has_ci,
	        ) = client.tree_summary(owner, name, report.default_branch)

	    except RateLimitError as exc:
	        report.fetch_error = f"GitHub rate limit: {exc}"
	        return report
	    except FetchError as exc:
	        report.fetch_error = f"Network error: {exc}"
	        return report
	    except Exception as exc:
	        # Pipeline boundary: any other failure is reported, not raised.
	        report.fetch_error = f"Unexpected error: {exc}"
	        return report
	    else:
	        # Reached only when every fetch succeeded.
	        activity = compute_activity_score(report)
	        report.activity_score = activity
	        complexity = compute_complexity_score(report)
	        report.complexity_score = complexity
	        report.difficulty = classify_difficulty(activity, complexity)

	    return report