# devkit/app/tools/doc_forge/github_fetcher.py
# Author: Mohammed AL Sarraj
# Commit: initial deploy (950dcd2)
"""Fetches repository structure and key file contents from GitHub API."""
import os
import re
import requests
# Base URL of the GitHub REST API; every API request in this module is built on it.
_GITHUB_API = "https://api.github.com"

# A file is dropped from the tree listing when any segment of its path is one
# of these directory names (VCS metadata, dependency caches, build output).
_SKIP_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".venv",
    "venv",
    "dist",
    "build",
    ".next",
    ".nuxt",
    "coverage",
    "htmlcov",
}

# Extensions ranked as source code when ordering the file tree.
_CODE_EXTS = {
    ".py",
    ".js", ".ts", ".jsx", ".tsx",
    ".go",
    ".rs",
    ".java",
    ".rb",
    ".php",
    ".cs",
    ".cpp", ".c", ".h",
    ".swift",
    ".kt",
}

# Extensions ranked as documentation/configuration, after source code.
_DOC_EXTS = {
    ".md", ".rst", ".txt",
    ".yaml", ".yml", ".toml", ".json",
}

# Paths / file names that are ranked first in the tree and fetched first.
# This is a list because its order decides the fetch order.
_PRIORITY = [
    "README.md",
    "readme.md",
    "README.rst",
    "main.py",
    "app.py",
    "index.js",
    "index.ts",
    "main.go",
    "src/main.rs",
    "setup.py",
    "pyproject.toml",
    "package.json",
    "go.mod",
    "Cargo.toml",
]
def _headers():
    """Build the HTTP headers sent with each GitHub request.

    The ``Accept`` header is always included. An ``Authorization`` bearer
    header is added only when the ``GITHUB_TOKEN`` environment variable is
    set to a non-empty value.
    """
    headers = {"Accept": "application/vnd.github.v3+json"}
    github_token = os.environ.get("GITHUB_TOKEN", "")
    if not github_token:
        # No token configured: send the request unauthenticated.
        return headers
    headers["Authorization"] = f"Bearer {github_token}"
    return headers
def parse_repo_url(url: str) -> tuple[str, str]:
    """Return (owner, repo) from a GitHub URL or owner/repo string.

    Accepted forms (surrounding whitespace and trailing slashes are ignored,
    and a trailing ``.git`` on the repo name is dropped):

    * web / clone URLs: ``https://github.com/owner/repo[.git][/tree/...]``
    * scp-style SSH remotes: ``git@github.com:owner/repo.git``
    * URLs with an explicit port: ``ssh://git@github.com:22/owner/repo.git``
    * plain shorthand: ``owner/repo``

    Raises:
        ValueError: if no non-empty (owner, repo) pair can be extracted.
    """
    url = url.strip().rstrip("/")

    # Match github.com/owner/repo and ignore /tree/, /blob/, /issues/ etc.
    # "[:/]" also accepts the "github.com:owner/repo" form used by SSH
    # remotes; "(?::\d+)?" skips an explicit port so it is not read as owner.
    m = re.search(r"github\.com(?::\d+)?[:/]([^/]+)/([^/?#\s]+)", url)
    if m:
        owner = m.group(1)
        repo = m.group(2).removesuffix(".git")
        if repo:
            return owner, repo

    # Plain "owner/repo" shorthand
    parts = url.split("/")
    if len(parts) == 2 and parts[0]:
        # Strip ".git" here too so both branches agree on the repo name.
        repo = parts[1].removesuffix(".git")
        if repo:
            return parts[0], repo

    raise ValueError(f"Cannot parse GitHub URL: {url!r}")
def get_repo_info(owner: str, repo: str) -> dict:
    """Fetch summary metadata for a repository from the GitHub REST API.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.

    Returns:
        A dict with the keys ``full_name``, ``description``, ``language``,
        ``stars``, ``forks``, ``topics``, ``default_branch`` and ``url``.
        Fields that are missing or ``null`` in the response are replaced by
        an empty string / 0 / empty list (``"main"`` for ``default_branch``).

    Raises:
        Whatever ``raise_for_status()`` raises for an unsuccessful response.
    """
    r = requests.get(f"{_GITHUB_API}/repos/{owner}/{repo}",
                     headers=_headers(), timeout=15)
    r.raise_for_status()
    d = r.json()
    # ``d.get(key, default)`` only covers absent keys. The API sends explicit
    # nulls (e.g. a repository without a description), so use ``or`` to
    # normalise those to the same defaults instead of leaking None.
    return {
        "full_name": d.get("full_name") or "",
        "description": d.get("description") or "",
        "language": d.get("language") or "",
        "stars": d.get("stargazers_count") or 0,
        "forks": d.get("forks_count") or 0,
        "topics": d.get("topics") or [],
        "default_branch": d.get("default_branch") or "main",
        "url": d.get("html_url") or "",
    }
def get_file_tree(owner: str, repo: str, branch: str = "main",
                  max_files: int = 150) -> list[str]:
    """List the repository's file paths, most relevant first.

    Requests the recursive git tree for ``branch``. On a 404 it retries once
    with ``master`` when ``branch`` is ``main``, otherwise with ``main``.
    Only blob entries are kept, and any path containing a segment listed in
    ``_SKIP_DIRS`` is dropped. The survivors are ordered priority files,
    then code, then docs, then everything else (original order is kept
    within each group) and cut to at most ``max_files`` entries.

    Raises whatever ``raise_for_status()`` raises when the final response is
    not successful.
    """
    def tree_url(ref):
        return f"{_GITHUB_API}/repos/{owner}/{repo}/git/trees/{ref}?recursive=1"

    def sort_key(path):
        # 0 = priority file, 1 = source code, 2 = docs/config, 3 = other.
        basename = path.rsplit("/", 1)[-1]
        if path in _PRIORITY or basename in _PRIORITY:
            return 0
        suffix = os.path.splitext(path)[1].lower()
        if suffix in _CODE_EXTS:
            return 1
        return 2 if suffix in _DOC_EXTS else 3

    response = requests.get(tree_url(branch), headers=_headers(), timeout=20)
    if response.status_code == 404:
        # Try main vs master
        fallback = "main" if branch != "main" else "master"
        response = requests.get(tree_url(fallback), headers=_headers(), timeout=20)
    response.raise_for_status()

    paths = []
    for entry in response.json().get("tree", []):
        if entry["type"] != "blob":
            continue
        if any(segment in _SKIP_DIRS for segment in entry["path"].split("/")):
            continue
        paths.append(entry["path"])

    # list.sort is stable, so files of equal rank keep their tree order.
    paths.sort(key=sort_key)
    return paths[:max_files]
def fetch_file(owner: str, repo: str, path: str, branch: str = "main") -> str:
    """Fetch the raw text of a single file, capped at 50,000 characters.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: File path inside the repository, using "/" separators.
        branch: Branch (or other ref) to read the file from.

    Returns:
        The decoded response text truncated to 50,000 characters, or ""
        when the server answers with anything other than HTTP 200.
    """
    from urllib.parse import quote  # local import keeps this block self-contained

    # Percent-encode the ref and path ("/" is left intact). Without this a
    # file name containing "#" or "?" is cut off as a URL fragment / query
    # string, and a literal "%xx" in a name is read as an escape sequence.
    url = ("https://raw.githubusercontent.com/"
           f"{owner}/{repo}/{quote(branch)}/{quote(path)}")
    r = requests.get(url, headers=_headers(), timeout=15)
    if r.status_code != 200:
        return ""
    return r.text[:50_000]  # cap at 50,000 characters per file
def fetch_key_files(owner: str, repo: str, tree: list[str],
                    branch: str = "main", max_chars: int = 60_000) -> dict[str, str]:
    """Fetch the most important files, up to ``max_chars`` characters in total.

    Files are tried in this order: entries of ``_PRIORITY`` that appear in
    ``tree`` (in ``_PRIORITY`` order), then every other path in ``tree`` in
    its given order. Files whose fetch yields no content are skipped.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        tree: Candidate file paths, e.g. the result of ``get_file_tree``.
        branch: Branch to read the files from.
        max_chars: Upper bound on the combined length of all returned
            contents. The file that reaches the limit is truncated to fit.

    Returns:
        Mapping of path to fetched content, in fetch order.
    """
    results: dict[str, str] = {}
    remaining = max_chars

    # Sets make the membership tests O(1) instead of scanning lists.
    available = set(tree)
    priority = set(_PRIORITY)

    # Always try priority files first
    ordered = [p for p in _PRIORITY if p in available]
    ordered += [p for p in tree if p not in priority]

    for path in ordered:
        if remaining <= 0:
            break
        content = fetch_file(owner, repo, path, branch)
        if not content:
            continue
        # Trim to the remaining budget so the total never exceeds max_chars;
        # checking only before each fetch let one whole file overshoot it.
        content = content[:remaining]
        results[path] = content
        remaining -= len(content)
    return results