Spaces:

Moealsarraj
/

devkit

Sleeping

devkit / app /tools /doc_forge /github_fetcher.py

Mohammed AL Sarraj

initial deploy

950dcd2 about 1 month ago

4.39 kB

	"""Fetches repository structure and key file contents from GitHub API."""
	import os
	import re
	import urllib.parse

	import requests

	# Base URL prepended to every GitHub REST API path built in this module.
	_GITHUB_API = "https://api.github.com"

	# A file is dropped from the tree listing when any segment of its path
	# appears in this set.
	_SKIP_DIRS = {
	    ".git",
	    "node_modules",
	    "__pycache__",
	    ".venv",
	    "venv",
	    "dist",
	    "build",
	    ".next",
	    ".nuxt",
	    "coverage",
	    "htmlcov",
	}

	# Lower-cased file extensions ranked as source code when sorting the tree.
	_CODE_EXTS = {
	    ".py",
	    ".js",
	    ".ts",
	    ".jsx",
	    ".tsx",
	    ".go",
	    ".rs",
	    ".java",
	    ".rb",
	    ".php",
	    ".cs",
	    ".cpp",
	    ".c",
	    ".h",
	    ".swift",
	    ".kt",
	}

	# Lower-cased file extensions ranked as documentation/config, after code.
	_DOC_EXTS = {
	    ".md",
	    ".rst",
	    ".txt",
	    ".yaml",
	    ".yml",
	    ".toml",
	    ".json",
	}

	# Paths / file names that are ranked first and fetched first; the order of
	# this list is the order in which they are fetched.
	_PRIORITY = [
	    "README.md",
	    "readme.md",
	    "README.rst",
	    "main.py",
	    "app.py",
	    "index.js",
	    "index.ts",
	    "main.go",
	    "src/main.rs",
	    "setup.py",
	    "pyproject.toml",
	    "package.json",
	    "go.mod",
	    "Cargo.toml",
	]


	def _headers():
	    """Build the HTTP headers used for GitHub requests.

	    The ``Accept`` header is always present. An ``Authorization`` bearer
	    header is added only when the ``GITHUB_TOKEN`` environment variable is
	    set to a non-empty value.
	    """
	    headers = {"Accept": "application/vnd.github.v3+json"}
	    github_token = os.environ.get("GITHUB_TOKEN", "")
	    if not github_token:
	        # Unset or empty token: send the request unauthenticated.
	        return headers
	    headers["Authorization"] = f"Bearer {github_token}"
	    return headers


	def parse_repo_url(url: str) -> tuple[str, str]:
	    """Return ``(owner, repo)`` from a GitHub URL or ``owner/repo`` string.

	    Accepted forms:

	    * web URLs containing ``github.com/owner/repo`` (anything after the
	      repository segment, such as ``/tree/...`` or ``/issues/...``, is
	      ignored);
	    * scp-style SSH remotes such as ``git@github.com:owner/repo.git``;
	    * the plain ``owner/repo`` shorthand.

	    A trailing ``.git`` on the repository name is removed in every form.

	    Args:
	        url: The URL or shorthand to parse. Surrounding whitespace and
	            trailing slashes are ignored.

	    Returns:
	        A ``(owner, repo)`` tuple of non-empty strings.

	    Raises:
	        ValueError: If no owner/repository pair can be extracted.
	    """
	    url = url.strip().rstrip("/")
	    # Match github.com/owner/repo — ignore /tree/, /blob/, /issues/ etc.
	    m = re.search(r"github\.com/([^/]+)/([^/?#\s]+)", url)
	    if m is None:
	        # scp-style remote (user@github.com:owner/repo). Requiring the
	        # "user@" prefix keeps "host:port/..." URLs from matching here.
	        m = re.search(r"^[\w.-]+@github\.com:([^/\s]+)/([^/?#\s]+)", url)
	    if m:
	        owner, repo = m.group(1), m.group(2)
	    else:
	        # Plain "owner/repo" shorthand
	        parts = url.split("/")
	        if len(parts) != 2:
	            raise ValueError(f"Cannot parse GitHub URL: {url!r}")
	        owner, repo = parts
	    repo = repo.removesuffix(".git")
	    # An empty component (e.g. "/repo" or "owner/.git") is not a repository.
	    if not owner or not repo:
	        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
	    return owner, repo


	def get_repo_info(owner: str, repo: str) -> dict:
	    """Fetch summary metadata for a repository from the GitHub REST API.

	    Args:
	        owner: Account or organisation that owns the repository.
	        repo: Repository name.

	    Returns:
	        A dict with the keys ``full_name``, ``description``, ``language``,
	        ``stars``, ``forks``, ``topics``, ``default_branch`` and ``url``.
	        Fields that are missing or null in the API response are replaced by
	        an empty string, ``0``, an empty list, or ``"main"`` respectively,
	        so no value is ever ``None``.

	    Raises:
	        requests.HTTPError: If the API answers with an error status
	            (raised by ``raise_for_status``).
	    """
	    r = requests.get(f"{_GITHUB_API}/repos/{owner}/{repo}",
	                     headers=_headers(), timeout=15)
	    r.raise_for_status()
	    d = r.json()
	    # ``dict.get(key, default)`` only falls back when the key is absent.
	    # The API sends explicit nulls for unset fields (description and
	    # language are documented as nullable), so use ``or`` to normalise
	    # those to the intended defaults as well.
	    return {
	        "full_name": d.get("full_name") or "",
	        "description": d.get("description") or "",
	        "language": d.get("language") or "",
	        "stars": d.get("stargazers_count") or 0,
	        "forks": d.get("forks_count") or 0,
	        "topics": d.get("topics") or [],
	        "default_branch": d.get("default_branch") or "main",
	        "url": d.get("html_url") or "",
	    }


	def get_file_tree(owner: str, repo: str, branch: str = "main",
	                  max_files: int = 150) -> list[str]:
	    """Return a flat list of file paths, most relevant files first.

	    The recursive git tree for ``branch`` is requested from the GitHub API.
	    If that request returns 404, one retry is made against ``master`` (when
	    ``branch`` is ``main``) or ``main`` (otherwise). Only blob entries are
	    kept, and any path with a segment listed in ``_SKIP_DIRS`` is dropped.

	    Ordering: priority files, then code files, then doc/config files, then
	    everything else; within a group the API's order is preserved. The
	    result is cut to the first ``max_files`` entries.

	    Raises:
	        requests.HTTPError: If the final tree request has an error status.
	    """
	    def tree_url(ref):
	        return f"{_GITHUB_API}/repos/{owner}/{repo}/git/trees/{ref}?recursive=1"

	    response = requests.get(tree_url(branch), headers=_headers(), timeout=20)
	    if response.status_code == 404:
	        # Try main vs master
	        fallback = "main" if branch != "main" else "master"
	        response = requests.get(tree_url(fallback), headers=_headers(),
	                                timeout=20)
	    response.raise_for_status()

	    paths = []
	    for entry in response.json().get("tree", []):
	        if entry["type"] != "blob":
	            continue
	        if _SKIP_DIRS.intersection(entry["path"].split("/")):
	            continue
	        paths.append(entry["path"])

	    # Sort: priority first, then code, then docs, then rest
	    def sort_key(path):
	        basename = path.rsplit("/", 1)[-1]
	        if path in _PRIORITY or basename in _PRIORITY:
	            return 0
	        suffix = os.path.splitext(path)[1].lower()
	        if suffix in _CODE_EXTS:
	            return 1
	        return 2 if suffix in _DOC_EXTS else 3

	    # list.sort is stable, so ties keep the order returned by the API.
	    paths.sort(key=sort_key)
	    return paths[:max_files]


	def fetch_file(owner: str, repo: str, path: str, branch: str = "main") -> str:
	    """Fetch the raw text of a single file, capped at 50,000 characters.

	    Args:
	        owner: Account or organisation that owns the repository.
	        repo: Repository name.
	        path: Repository-relative path of the file.
	        branch: Branch (or other ref) to read the file from.

	    Returns:
	        The first 50,000 characters of the file's decoded text, or an empty
	        string when the response status is anything other than 200.
	    """
	    # Percent-encode the ref and path (keeping "/") so that names containing
	    # "#", "?", "%" or spaces are sent as literal path characters instead of
	    # being interpreted as fragment, query or escape syntax.
	    safe_branch = urllib.parse.quote(branch)
	    safe_path = urllib.parse.quote(path)
	    r = requests.get(
	        f"https://raw.githubusercontent.com/{owner}/{repo}/{safe_branch}/{safe_path}",
	        headers=_headers(), timeout=15)
	    if r.status_code != 200:
	        return ""
	    return r.text[:50_000]  # cap at 50,000 characters per file


	def fetch_key_files(owner: str, repo: str, tree: list[str],
	                    branch: str = "main", max_chars: int = 60_000) -> dict[str, str]:
	    """Fetch the most important files, returning at most ``max_chars`` total.

	    Files are fetched in this order: entries of ``_PRIORITY`` that occur in
	    ``tree`` (in ``_PRIORITY`` order), followed by the remaining entries of
	    ``tree`` in their given order. Files whose fetch yields no content are
	    skipped. The file that would exceed the budget is truncated to the
	    remaining allowance, so the combined length of all returned contents
	    never exceeds ``max_chars``.

	    Args:
	        owner: Account or organisation that owns the repository.
	        repo: Repository name.
	        tree: Repository-relative file paths to choose from.
	        branch: Branch (or other ref) to read the files from.
	        max_chars: Upper bound on the total number of characters returned.

	    Returns:
	        A dict mapping each fetched path to its (possibly truncated) content.
	    """
	    # Sets give O(1) membership tests while the lists keep the ordering.
	    available = set(tree)
	    priority = set(_PRIORITY)
	    # Always try priority files first
	    ordered = ([p for p in _PRIORITY if p in available]
	               + [p for p in tree if p not in priority])

	    results: dict[str, str] = {}
	    remaining = max_chars
	    # dict.fromkeys drops repeated paths (keeping first occurrence) so a
	    # path is never fetched or counted against the budget twice.
	    for path in dict.fromkeys(ordered):
	        if remaining <= 0:
	            break
	        content = fetch_file(owner, repo, path, branch)
	        if not content:
	            continue
	        # Trim the last file so the total stays within max_chars instead of
	        # overshooting by up to one whole file.
	        content = content[:remaining]
	        results[path] = content
	        remaining -= len(content)
	    return results