Spaces:
Sleeping
Sleeping
| """Fetches repository structure and key file contents from GitHub API.""" | |
| import os | |
| import re | |
| import requests | |
# Base URL of the GitHub REST API used by the request helpers in this module.
_GITHUB_API = "https://api.github.com"

# A file is dropped from the tree listing when any segment of its path is
# one of these directory names.
_SKIP_DIRS = {
    ".git",
    "node_modules",
    "__pycache__",
    ".venv",
    "venv",
    "dist",
    "build",
    ".next",
    ".nuxt",
    "coverage",
    "htmlcov",
}

# Extensions ranked as source code when ordering the file tree.
_CODE_EXTS = {
    ".py",
    ".js",
    ".ts",
    ".jsx",
    ".tsx",
    ".go",
    ".rs",
    ".java",
    ".rb",
    ".php",
    ".cs",
    ".cpp",
    ".c",
    ".h",
    ".swift",
    ".kt",
}

# Extensions ranked as documentation/configuration, after source code.
_DOC_EXTS = {
    ".md",
    ".rst",
    ".txt",
    ".yaml",
    ".yml",
    ".toml",
    ".json",
}

# File names (or repo-relative paths) that are listed and fetched first.
_PRIORITY = [
    "README.md",
    "readme.md",
    "README.rst",
    "main.py",
    "app.py",
    "index.js",
    "index.ts",
    "main.go",
    "src/main.rs",
    "setup.py",
    "pyproject.toml",
    "package.json",
    "go.mod",
    "Cargo.toml",
]
def _headers():
    """Build the HTTP headers used for this module's GitHub requests.

    The ``Accept`` header is always present. A Bearer ``Authorization``
    header is added only when the ``GITHUB_TOKEN`` environment variable is
    set to a non-empty value.
    """
    headers = {"Accept": "application/vnd.github.v3+json"}
    token = os.environ.get("GITHUB_TOKEN", "")
    if not token:
        # Unset or empty token: send the request unauthenticated.
        return headers
    headers["Authorization"] = f"Bearer {token}"
    return headers
def parse_repo_url(url: str) -> tuple[str, str]:
    """Return (owner, repo) from a GitHub URL or owner/repo string.

    Accepted forms:
      * web URLs, with or without a trailing path such as /tree/, /blob/
        or /issues/ (``https://github.com/owner/repo/tree/main``)
      * clone URLs ending in ``.git``
      * SSH remotes (``git@github.com:owner/repo.git``)
      * the plain ``owner/repo`` shorthand

    A trailing ``.git`` is removed from the repository name in every form.

    Raises:
        ValueError: if no owner/repo pair can be extracted.
    """
    url = url.strip().rstrip("/")

    # "github.com" followed by "/" (web/clone URL) or ":" (scp-style SSH
    # remote). The optional ":<digits>" lets an explicit port be skipped
    # instead of being mistaken for the owner.
    m = re.search(r"github\.com(?::\d+)?[/:]([^/\s]+)/([^/?#\s]+)", url)
    if m:
        owner, repo = m.group(1), m.group(2)
    else:
        # Plain "owner/repo" shorthand. Reject anything containing
        # whitespace, ":" or "@" so that other URL/remote syntaxes are not
        # silently misread as an owner name.
        parts = url.split("/")
        if len(parts) != 2 or not all(parts) or re.search(r"[\s:@]", url):
            raise ValueError(f"Cannot parse GitHub URL: {url!r}")
        owner, repo = parts

    repo = repo.removesuffix(".git")
    if not repo:
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
    return owner, repo
def get_repo_info(owner: str, repo: str) -> dict:
    """Fetch summary metadata for a repository from the GitHub REST API.

    Returns a dict with the keys ``full_name``, ``description``,
    ``language``, ``stars``, ``forks``, ``topics``, ``default_branch`` and
    ``url``. Fields that are missing or null in the API response fall back
    to an empty string / 0 / empty list (``"main"`` for ``default_branch``).

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    r = requests.get(f"{_GITHUB_API}/repos/{owner}/{repo}",
                     headers=_headers(), timeout=15)
    r.raise_for_status()
    d = r.json()
    # The API sends JSON null (not a missing key) for unset fields such as
    # description and language. dict.get's default only applies to missing
    # keys, so coalesce with `or` to keep None out of the result.
    return {
        "full_name": d.get("full_name") or "",
        "description": d.get("description") or "",
        "language": d.get("language") or "",
        "stars": d.get("stargazers_count") or 0,
        "forks": d.get("forks_count") or 0,
        "topics": d.get("topics") or [],
        "default_branch": d.get("default_branch") or "main",
        "url": d.get("html_url") or "",
    }
def get_file_tree(owner: str, repo: str, branch: str = "main",
                  max_files: int = 150) -> list[str]:
    """List the repository's file paths, most relevant first.

    Requests the recursive git tree for ``branch``. On a 404 the request is
    retried once with the other of "main"/"master" ("main" when ``branch``
    is anything other than "main"). Files under any directory named in
    ``_SKIP_DIRS`` are left out. The result is ordered priority files, then
    source code, then docs/config, then everything else, and cut to
    ``max_files`` entries.

    Raises:
        requests.HTTPError: if the final tree request returns an error status.
    """
    def tree_url(ref):
        return f"{_GITHUB_API}/repos/{owner}/{repo}/git/trees/{ref}?recursive=1"

    resp = requests.get(tree_url(branch), headers=_headers(), timeout=20)
    if resp.status_code == 404:
        # Try main vs master
        fallback = "main" if branch != "main" else "master"
        resp = requests.get(tree_url(fallback), headers=_headers(), timeout=20)
    resp.raise_for_status()

    paths = []
    for entry in resp.json().get("tree", []):
        if entry["type"] != "blob":
            continue
        if not _SKIP_DIRS.isdisjoint(entry["path"].split("/")):
            continue
        paths.append(entry["path"])

    def sort_key(path):
        # 0 = priority file, 1 = source code, 2 = docs/config, 3 = other
        basename = path.rsplit("/", 1)[-1]
        if path in _PRIORITY or basename in _PRIORITY:
            return 0
        suffix = os.path.splitext(path)[1].lower()
        if suffix in _CODE_EXTS:
            return 1
        return 2 if suffix in _DOC_EXTS else 3

    # list.sort is stable, so paths of equal rank keep the API's order.
    paths.sort(key=sort_key)
    return paths[:max_files]
def fetch_file(owner: str, repo: str, path: str, branch: str = "main") -> str:
    """Fetch the raw text of a single file from raw.githubusercontent.com.

    Args:
        owner: Repository owner.
        repo: Repository name.
        path: Repo-relative file path, unescaped.
        branch: Branch (or other ref) to read the file from.

    Returns:
        The decoded response text, capped at 50,000 characters, or an
        empty string when the response status is anything other than 200.
    """
    from urllib.parse import quote

    # Percent-encode the ref and path ("/" is kept as a separator). Without
    # this, a file name containing "#", "?" or "%" is parsed as a URL
    # fragment/query/escape and the wrong resource is requested.
    url = (f"https://raw.githubusercontent.com/{owner}/{repo}/"
           f"{quote(branch)}/{quote(path)}")
    r = requests.get(url, headers=_headers(), timeout=15)
    if r.status_code != 200:
        return ""
    return r.text[:50_000]  # cap at 50,000 characters per file
def fetch_key_files(owner: str, repo: str, tree: list[str],
                    branch: str = "main", max_chars: int = 60_000) -> dict[str, str]:
    """Fetch the most important files, at most ``max_chars`` characters total.

    Paths listed in ``_PRIORITY`` that occur in ``tree`` are fetched first
    (in ``_PRIORITY`` order), followed by the remaining paths in ``tree``
    order. Files that come back empty are skipped. The file that would
    cross the budget is truncated to the characters still available, so
    the combined length of the returned contents never exceeds
    ``max_chars``.

    Returns:
        Mapping of path to fetched content, in fetch order.
    """
    results: dict[str, str] = {}
    total = 0
    available = set(tree)  # O(1) membership while ordering
    # Always try priority files first
    ordered = ([p for p in _PRIORITY if p in available]
               + [p for p in tree if p not in _PRIORITY])
    for path in ordered:
        remaining = max_chars - total
        if remaining <= 0:
            break
        # Trim to the remaining budget so the last file cannot overshoot.
        content = fetch_file(owner, repo, path, branch)[:remaining]
        if content:
            results[path] = content
            total += len(content)
    return results