"""Fetches repository structure and key file contents from GitHub API."""
import os
import re
from urllib.parse import quote

import requests

# Base URL of the GitHub REST API.
_GITHUB_API = "https://api.github.com"

# A path is dropped from the file tree when any of its segments is listed here.
_SKIP_DIRS = {
    ".git", ".next", ".nuxt", ".venv", "__pycache__", "build",
    "coverage", "dist", "htmlcov", "node_modules", "venv",
}

# Extensions ranked as source code when ordering the file tree.
_CODE_EXTS = {
    ".c", ".cpp", ".cs", ".go", ".h", ".java", ".js", ".jsx",
    ".kt", ".php", ".py", ".rb", ".rs", ".swift", ".ts", ".tsx",
}

# Extensions ranked as documentation / configuration.
_DOC_EXTS = {".json", ".md", ".rst", ".toml", ".txt", ".yaml", ".yml"}

# Files fetched ahead of everything else. Order matters: it is the order in
# which fetch_key_files requests them.
_PRIORITY = [
    "README.md",
    "readme.md",
    "README.rst",
    "main.py",
    "app.py",
    "index.js",
    "index.ts",
    "main.go",
    "src/main.rs",
    "setup.py",
    "pyproject.toml",
    "package.json",
    "go.mod",
    "Cargo.toml",
]


def _headers():
    """Build the HTTP headers sent with each GitHub request.

    ``GITHUB_TOKEN`` is read from the environment on every call. When it holds
    a non-blank value a ``Bearer`` Authorization header is added; otherwise
    only the Accept header is returned and the request goes out
    unauthenticated.

    Returns:
        dict: a fresh header mapping (safe for the caller to mutate).
    """
    headers = {"Accept": "application/vnd.github.v3+json"}
    # Strip surrounding whitespace: a stray space or trailing newline (easy to
    # pick up from a .env file or secret mount) would otherwise end up inside
    # the Authorization value, and a blank-only token would be sent as-is.
    token = os.environ.get("GITHUB_TOKEN", "").strip()
    if token:
        headers["Authorization"] = f"Bearer {token}"
    return headers


def parse_repo_url(url: str) -> tuple[str, str]:
    """Return (owner, repo) from a GitHub URL or owner/repo string.

    Accepted forms:

    * web URLs, with or without scheme, port, or trailing path/query/fragment
      (``https://github.com/owner/repo/tree/main`` -> ``owner``, ``repo``);
    * SSH remotes (``git@github.com:owner/repo.git``);
    * the plain ``owner/repo`` shorthand.

    A trailing ``.git`` on the repository name is dropped in every form.

    Raises:
        ValueError: if no owner/repo pair can be extracted.
    """
    url = url.strip().rstrip("/")
    # Match github.com/owner/repo and ignore /tree/, /blob/, /issues/ etc.
    # ``[/:]`` also accepts the scp-style SSH separator; the optional
    # ``:port`` keeps an explicit port from being mistaken for the owner.
    m = re.search(r"github\.com(?::\d+)?[/:]([^/]+)/([^/?#\s]+)", url)
    if m:
        owner, repo = m.group(1), m.group(2)
    elif "github.com" in url:
        # A GitHub URL without both owner and repo (e.g. "github.com/owner")
        # must not fall through to the shorthand branch, which would return
        # the host name as the owner.
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
    else:
        # Plain "owner/repo" shorthand
        parts = url.split("/")
        if len(parts) != 2:
            raise ValueError(f"Cannot parse GitHub URL: {url!r}")
        owner, repo = parts
    if repo.endswith(".git"):
        repo = repo[:-4]
    if not owner or not repo:
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
    return owner, repo


def get_repo_info(owner: str, repo: str) -> dict:
    """Fetch basic repository metadata from the GitHub REST API.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.

    Returns:
        A dict with the keys ``full_name``, ``description``, ``language``,
        ``stars``, ``forks``, ``topics``, ``default_branch`` and ``url``.
        Fields that are missing or null in the API response are replaced by
        an empty string / ``0`` / empty list (``"main"`` for
        ``default_branch``), so callers never receive ``None``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
    """
    r = requests.get(f"{_GITHUB_API}/repos/{owner}/{repo}",
                     headers=_headers(), timeout=15)
    r.raise_for_status()
    d = r.json()
    # ``d.get(key, default)`` only applies the default when the key is absent.
    # The API sends explicit nulls (e.g. for a repository without description
    # or detected language), hence the ``or`` fallbacks.
    return {
        "full_name":    d.get("full_name") or "",
        "description":  d.get("description") or "",
        "language":     d.get("language") or "",
        "stars":        d.get("stargazers_count") or 0,
        "forks":        d.get("forks_count") or 0,
        "topics":       d.get("topics") or [],
        "default_branch": d.get("default_branch") or "main",
        "url":          d.get("html_url") or "",
    }


def get_file_tree(owner: str, repo: str, branch: str = "main",
                  max_files: int = 150) -> list[str]:
    """List the repository's file paths, most relevant first.

    The recursive git tree for ``branch`` is requested; if that yields a 404,
    one retry is made against the other of ``main``/``master`` (``main`` when
    ``branch`` is anything else). Only blobs are kept, and any path with a
    segment in ``_SKIP_DIRS`` is dropped.

    Ordering: priority files (full path or bare file name listed in
    ``_PRIORITY``), then code extensions, then doc/config extensions, then
    everything else. Within a tier the API's order is kept. At most
    ``max_files`` paths are returned.

    Raises:
        requests.HTTPError: if the final response has an error status.
    """
    def request_tree(ref):
        return requests.get(
            f"{_GITHUB_API}/repos/{owner}/{repo}/git/trees/{ref}?recursive=1",
            headers=_headers(), timeout=20)

    def tier(path):
        # 0 = priority, 1 = code, 2 = docs/config, 3 = anything else
        basename = path.rsplit("/", 1)[-1]
        if path in _PRIORITY or basename in _PRIORITY:
            return 0
        suffix = os.path.splitext(path)[1].lower()
        if suffix in _CODE_EXTS:
            return 1
        return 2 if suffix in _DOC_EXTS else 3

    resp = request_tree(branch)
    if resp.status_code == 404:
        # Try main vs master
        resp = request_tree("master" if branch == "main" else "main")
    resp.raise_for_status()

    paths = []
    for entry in resp.json().get("tree", []):
        if entry["type"] != "blob":
            continue
        candidate = entry["path"]
        if _SKIP_DIRS.isdisjoint(candidate.split("/")):
            paths.append(candidate)

    # list.sort is stable, so ties keep the order the API returned.
    paths.sort(key=tier)
    return paths[:max_files]


def fetch_file(owner: str, repo: str, path: str, branch: str = "main") -> str:
    """Fetch the raw text of a single file from raw.githubusercontent.com.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: File path relative to the repository root.
        branch: Branch (or other ref) to read the file from.

    Returns:
        The decoded response text, capped at 50,000 characters, or ``""``
        when the server answers with anything other than HTTP 200.
    """
    # Percent-encode the ref and the path (keeping "/" as the separator).
    # Without this, a "#" or "?" in a file name is parsed as the start of a
    # URL fragment/query and the request goes to a truncated path.
    ref = quote(branch, safe="/")
    file_path = quote(path, safe="/")
    r = requests.get(
        f"https://raw.githubusercontent.com/{owner}/{repo}/{ref}/{file_path}",
        headers=_headers(), timeout=15)
    if r.status_code != 200:
        return ""
    return r.text[:50_000]  # cap at 50,000 characters per file


def fetch_key_files(owner: str, repo: str, tree: list[str],
                    branch: str = "main", max_chars: int = 60_000) -> dict[str, str]:
    """Fetch the most important files, up to ``max_chars`` characters in total.

    Files listed in ``_PRIORITY`` that appear in ``tree`` are requested first,
    in ``_PRIORITY`` order, followed by the remaining paths in ``tree`` order.
    Files that come back empty are left out of the result.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        tree: Candidate file paths, e.g. the result of ``get_file_tree``.
        branch: Branch to read the files from.
        max_chars: Upper bound on the combined length of all returned contents.

    Returns:
        Mapping of path to content. The last file fetched is truncated if it
        would otherwise push the combined length past ``max_chars``.
    """
    results: dict[str, str] = {}
    remaining = max_chars
    # Sets make the membership tests below O(1) instead of scanning the lists.
    in_tree = set(tree)
    priority = set(_PRIORITY)
    # Always try priority files first
    ordered = ([p for p in _PRIORITY if p in in_tree]
               + [p for p in tree if p not in priority])
    for path in ordered:
        if remaining <= 0:
            break
        content = fetch_file(owner, repo, path, branch)
        if content:
            # Trim to the remaining budget so the total honours max_chars.
            content = content[:remaining]
            results[path] = content
            remaining -= len(content)
    return results