"""
GitHub Repository Intelligence Analyzer
Core analysis engine — scoring, complexity, and classification logic.

Author: GSoC 2026 Pre-Task Submission
"""

import os
import math
import datetime
import requests
from dataclasses import dataclass, field
from typing import Optional


# ──────────────────────────────────────────────
# Data model
# ──────────────────────────────────────────────


@dataclass
class RepoReport:
    """Everything the analyzer knows about one repository.

    `analyze_repo` creates the report, fills the raw fields from the GitHub
    API, and then stores the computed scores. When fetching fails,
    `fetch_error` is set and the score fields keep their defaults.
    """

    url: str  # URL exactly as it was handed to `analyze_repo`
    owner: str  # parsed from the URL; "?" when the URL could not be parsed
    name: str  # parsed from the URL; "?" when the URL could not be parsed

    # Raw GitHub data (copied from the repository endpoint response)
    stars: int = 0  # "stargazers_count"
    forks: int = 0  # "forks_count"
    open_issues: int = 0  # "open_issues_count"
    watchers: int = 0  # "watchers_count"
    size_kb: int = 0  # "size"
    language: Optional[str] = None  # "language" (single value from the API)
    languages: dict = field(default_factory=dict)  # languages endpoint response
    topics: list = field(default_factory=list)
    license: Optional[str] = None  # "spdx_id" of the license object, if any
    created_at: Optional[str] = None  # timestamp strings kept as returned
    updated_at: Optional[str] = None
    pushed_at: Optional[str] = None  # feeds the recency part of the activity score
    default_branch: str = "main"  # branch whose file tree is inspected
    has_wiki: bool = False
    has_pages: bool = False
    archived: bool = False

    # Fetched separately (one extra API call each)
    contributor_count: int = 0
    commit_count_recent: int = 0  # commits in the last 90 days (client default)
    release_count: int = 0
    has_ci: bool = False  # True when a tree path starts with a CI_PATHS entry
    dependency_files: list = field(default_factory=list)  # DEPENDENCY_FILES names found
    file_count: int = 0  # number of blobs in the default-branch tree

    # Computed scores
    activity_score: float = 0.0  # 0-100, see compute_activity_score
    complexity_score: float = 0.0  # 0-100, see compute_complexity_score
    difficulty: str = "Unknown"  # label from classify_difficulty
    fetch_error: Optional[str] = None  # human-readable reason the fetch failed


# ──────────────────────────────────────────────
# GitHub API client
# ──────────────────────────────────────────────

# Manifest / lock-file names treated as evidence of declared dependencies.
# `GitHubClient.tree_summary` compares these against file *basenames*, so a
# manifest in any directory of the tree counts.
DEPENDENCY_FILES = [
    # Python
    "requirements.txt",
    "Pipfile",
    "pyproject.toml",
    # JS/TS
    "package.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    # Rust/Go/Java
    "Cargo.toml",
    "go.mod",
    "pom.xml",
    "build.gradle",
    # Ruby/PHP/Elixir
    "Gemfile",
    "composer.json",
    "mix.exs",
]

# Paths that indicate a CI setup. `GitHubClient.tree_summary` treats each
# entry as a string prefix of a repository-relative file path, so directory
# entries (e.g. ".github/workflows") match every file below them.
CI_PATHS = [
    ".github/workflows",
    ".travis.yml",
    ".circleci",
    "Jenkinsfile",
    ".gitlab-ci.yml",
    "azure-pipelines.yml",
]


class GitHubClient:
    BASE = "https://api.github.com"

    def __init__(self, token: Optional[str] = None):
        self.session = requests.Session()
        self.session.headers.update(
            {
                "Accept": "application/vnd.github+json",
                "X-GitHub-Api-Version": "2022-11-28",
            }
        )
        if token:
            self.session.headers["Authorization"] = f"Bearer {token}"

    def _get(self, path: str, params: dict = None) -> Optional[dict | list]:
        """Safe GET with rate-limit awareness."""
        url = f"{self.BASE}{path}"
        try:
            resp = self.session.get(url, params=params, timeout=15)
            if resp.status_code == 403:
                remaining = resp.headers.get("X-RateLimit-Remaining", "?")
                reset = resp.headers.get("X-RateLimit-Reset", "?")
                raise RateLimitError(
                    f"Rate limit hit. Remaining: {remaining}. Reset epoch: {reset}"
                )
            if resp.status_code == 404:
                return None
            resp.raise_for_status()
            return resp.json()
        except RateLimitError:
            raise
        except requests.RequestException as e:
            raise FetchError(str(e)) from e

    def repo(self, owner: str, name: str) -> Optional[dict]:
        return self._get(f"/repos/{owner}/{name}")

    def languages(self, owner: str, name: str) -> dict:
        data = self._get(f"/repos/{owner}/{name}/languages")
        return data if isinstance(data, dict) else {}

    def contributors(self, owner: str, name: str) -> int:
        """Count contributors — uses pagination header trick for speed."""
        resp = self.session.get(
            f"{self.BASE}/repos/{owner}/{name}/contributors",
            params={"per_page": 1, "anon": "true"},
            timeout=15,
        )
        if resp.status_code in (403, 404, 204):
            return 0
        # GitHub returns last-page number in Link header
        link = resp.headers.get("Link", "")
        if 'rel="last"' in link:
            try:
                last_part = [p for p in link.split(",") if 'rel="last"' in p][0]
                page_num = int(last_part.split("page=")[-1].split(">")[0])
                return page_num
            except (IndexError, ValueError):
                pass
        try:
            return len(resp.json())
        except Exception:
            return 0

    def recent_commits(self, owner: str, name: str, days: int = 90) -> int:
        since = (
            datetime.datetime.utcnow() - datetime.timedelta(days=days)
        ).isoformat() + "Z"
        # Use commits endpoint with since filter; count via pagination
        resp = self.session.get(
            f"{self.BASE}/repos/{owner}/{name}/commits",
            params={"per_page": 1, "since": since},
            timeout=15,
        )
        if resp.status_code in (403, 404, 409):  # 409 = empty repo
            return 0
        link = resp.headers.get("Link", "")
        if 'rel="last"' in link:
            try:
                last_part = [p for p in link.split(",") if 'rel="last"' in p][0]
                return int(last_part.split("page=")[-1].split(">")[0])
            except (IndexError, ValueError):
                pass
        try:
            return len(resp.json())
        except Exception:
            return 0

    def releases(self, owner: str, name: str) -> int:
        data = self._get(f"/repos/{owner}/{name}/releases", params={"per_page": 100})
        return len(data) if isinstance(data, list) else 0

    def tree_summary(
        self, owner: str, name: str, branch: str
    ) -> tuple[int, list, bool]:
        """Return (file_count, dependency_files_found, has_ci)."""
        data = self._get(
            f"/repos/{owner}/{name}/git/trees/{branch}",
            params={"recursive": "1"},
        )
        if not data or "tree" not in data:
            return 0, [], False

        paths = [item["path"] for item in data["tree"] if item["type"] == "blob"]
        file_count = len(paths)

        # Dependency detection (check file names, not full paths)
        filenames = {p.split("/")[-1] for p in paths}
        dep_found = [f for f in DEPENDENCY_FILES if f in filenames]

        # CI detection
        has_ci = any(any(p.startswith(ci) for p in paths) for ci in CI_PATHS)

        return file_count, dep_found, has_ci


class RateLimitError(Exception):
    """Raised when GitHub rejects a request because of rate limiting."""


class FetchError(Exception):
    """Raised when an HTTP request to the GitHub API fails."""


# ──────────────────────────────────────────────
# Scoring engine
# ──────────────────────────────────────────────


def _days_since(iso_str: Optional[str]) -> float:
    """Days elapsed since an ISO 8601 timestamp. Returns large number if None."""
    if not iso_str:
        return 9999
    try:
        dt = datetime.datetime.fromisoformat(iso_str.replace("Z", "+00:00"))
        delta = datetime.datetime.now(datetime.timezone.utc) - dt
        return delta.total_seconds() / 86400
    except Exception:
        return 9999


def compute_activity_score(report: RepoReport) -> float:
    """
    Activity Score (0–100): measures how alive and maintained a repo is.

    Formula (weighted sum, each component clamped to 0–1, then ×100):

      Component                 Weight  Rationale
      ─────────────────────────────────────────────────────────
      recent_commits (90d)       0.30   Primary signal of active dev
      contributor_count          0.20   Community health
      stars (log-scaled)         0.15   Popularity / interest
      forks (log-scaled)         0.10   Adoption / downstream use
      open_issues                0.10   Engagement (capped)
      recency (days since push)  0.10   Freshness
      releases                   0.05   Delivery maturity

    Caps prevent one outlier metric from dominating.

    Args:
        report: Report whose raw counters and `pushed_at` are already filled.

    Returns:
        The score rounded to two decimals.
    """

    def log_scale(value: float, cap: int) -> float:
        """Map value to [0,1] using log scale with a cap."""
        if value <= 0:
            return 0.0
        return min(math.log1p(value) / math.log1p(cap), 1.0)

    def recency_score(days: float) -> float:
        """1.0 if pushed today, decays linearly to 0.0 at 365 days."""
        # Clamp the upper end too: a push timestamp in the future (clock
        # skew, bad data) gives negative days and would exceed 1.0.
        return min(1.0, max(0.0, 1.0 - days / 365.0))

    # name -> (normalised value in [0, 1], weight); the weights sum to 1.0
    components = {
        "recent_commits": (log_scale(report.commit_count_recent, 500), 0.30),
        "contributors": (log_scale(report.contributor_count, 200), 0.20),
        "stars": (log_scale(report.stars, 5000), 0.15),
        "forks": (log_scale(report.forks, 1000), 0.10),
        "open_issues": (log_scale(report.open_issues, 100), 0.10),
        "recency": (recency_score(_days_since(report.pushed_at)), 0.10),
        "releases": (log_scale(report.release_count, 50), 0.05),
    }

    score = sum(val * weight for val, weight in components.values())
    return round(score * 100, 2)


def compute_complexity_score(report: RepoReport) -> float:
    """
    Complexity Score (0–100): estimates technical depth of the repo.

    Weighted sum of five signals, each normalised to 0–1, then ×100:

      Signal                     Weight  Normalisation
      ─────────────────────────────────────────────────────────
      file_count                  0.30   log scale, saturates at 2000
      number of languages         0.25   log scale, saturates at 10
      number of dependency files  0.20   log scale, saturates at 8
      size_kb                     0.15   log scale, saturates at 100 000
      has_ci                      0.10   1.0 when truthy, else 0.0

    Returns the score rounded to two decimals.
    """

    def capped_log(amount: float, ceiling: int) -> float:
        # Non-positive amounts contribute nothing; the rest grow with
        # log1p and saturate at 1.0 once `amount` reaches `ceiling`.
        if amount <= 0:
            return 0.0
        ratio = math.log1p(amount) / math.log1p(ceiling)
        return 1.0 if ratio > 1.0 else ratio

    language_total = len(report.languages)
    manifest_total = len(report.dependency_files)

    # (normalised value, weight) pairs; the weights add up to 1.0
    weighted_parts = [
        (capped_log(report.file_count, 2000), 0.30),
        (capped_log(language_total, 10), 0.25),
        (capped_log(manifest_total, 8), 0.20),
        (capped_log(report.size_kb, 100_000), 0.15),
        (float(bool(report.has_ci)), 0.10),
    ]

    total = sum(value * weight for value, weight in weighted_parts)
    return round(total * 100, 2)


def classify_difficulty(activity: float, complexity: float) -> str:
    """
    Map an (activity, complexity) score pair to a learning-difficulty label.

    Rules, checked in this order:

      "Advanced"      complexity >= 70 and activity >= 75
      "Intermediate"  mean of both >= 50, or complexity >= 55,
                      or activity >= 60
      "Beginner"      everything else
    """
    if complexity >= 70 and activity >= 75:
        return "Advanced"

    mean_score = (activity + complexity) / 2
    is_intermediate = mean_score >= 50 or complexity >= 55 or activity >= 60
    return "Intermediate" if is_intermediate else "Beginner"


# ──────────────────────────────────────────────
# Main analysis pipeline
# ──────────────────────────────────────────────


def parse_github_url(url: str) -> tuple[str, str]:
    """Extract (owner, repo) from various GitHub URL formats.

    Accepted shapes include:
      https://github.com/owner/repo        (also http://, www., any host case)
      https://github.com/owner/repo.git
      https://github.com/owner/repo/tree/main?tab=readme#section
      git@github.com:owner/repo.git        (scp-style SSH)
      github.com/owner/repo  and  owner/repo

    Raises:
        ValueError: when no owner/repo pair can be found.
    """
    url = url.strip()

    # Query string and fragment never belong to the owner/repo path.
    path = url.split("#", 1)[0].split("?", 1)[0]
    # Drop the scheme, whatever it is.
    if "://" in path:
        path = path.split("://", 1)[1]

    # Drop "user@" credentials and turn scp-style "host:owner" into "host/owner".
    head, sep, tail = path.partition("/")
    if "@" in head:
        head = head.rsplit("@", 1)[1].replace(":", "/", 1)
        path = head + sep + tail

    github_hosts = {"github.com", "www.github.com"}
    parts = [p for p in path.split("/") if p and p.lower() not in github_hosts]
    if len(parts) < 2:
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")

    owner, repo = parts[0], parts[1]
    # Clone URLs carry a ".git" suffix that is not part of the repo name.
    if repo.endswith(".git"):
        repo = repo[: -len(".git")]
    if not repo:
        raise ValueError(f"Cannot parse GitHub URL: {url!r}")
    return owner, repo


def analyze_repo(url: str, client: GitHubClient) -> RepoReport:
    """Analyze one repository URL and return its report.

    Steps: parse the URL, fetch metadata and the auxiliary counters through
    `client`, then compute both scores and the difficulty label.

    The function does not raise for fetch problems. An unparsable URL, a
    missing repository, a rate limit, a network failure, or any other
    exception during fetching is recorded in `report.fetch_error`, and the
    report is returned without scores (fields fetched so far are kept).
    """
    try:
        owner, name = parse_github_url(url)
    except ValueError as parse_err:
        return RepoReport(url=url, owner="?", name="?", fetch_error=str(parse_err))

    report = RepoReport(url=url, owner=owner, name=name)

    try:
        # Core repository metadata; None means the API answered 404.
        meta = client.repo(owner, name)
        if meta is None:
            report.fetch_error = (
                "Repository not found (404). Check the URL or repo visibility."
            )
            return report

        report.stars = meta.get("stargazers_count", 0)
        report.forks = meta.get("forks_count", 0)
        report.open_issues = meta.get("open_issues_count", 0)
        report.watchers = meta.get("watchers_count", 0)
        report.size_kb = meta.get("size", 0)
        report.language = meta.get("language")
        # "license" may be null in the payload, hence the `or {}`.
        license_info = meta.get("license") or {}
        report.license = license_info.get("spdx_id")
        report.created_at = meta.get("created_at")
        report.updated_at = meta.get("updated_at")
        report.pushed_at = meta.get("pushed_at")
        report.default_branch = meta.get("default_branch", "main")
        report.has_wiki = meta.get("has_wiki", False)
        report.has_pages = meta.get("has_pages", False)
        report.archived = meta.get("archived", False)
        report.topics = meta.get("topics", [])

        # Auxiliary endpoints, one call each, in a fixed order.
        report.languages = client.languages(owner, name)
        report.contributor_count = client.contributors(owner, name)
        report.commit_count_recent = client.recent_commits(owner, name)
        report.release_count = client.releases(owner, name)

        # File tree of the default branch.
        (
            report.file_count,
            report.dependency_files,
            report.has_ci,
        ) = client.tree_summary(owner, name, report.default_branch)
    except RateLimitError as err:
        report.fetch_error = f"GitHub rate limit: {err}"
    except FetchError as err:
        report.fetch_error = f"Network error: {err}"
    except Exception as err:
        report.fetch_error = f"Unexpected error: {err}"
    else:
        # Everything was fetched: derive the scores and the label.
        report.activity_score = compute_activity_score(report)
        report.complexity_score = compute_complexity_score(report)
        report.difficulty = classify_difficulty(
            report.activity_score, report.complexity_score
        )

    return report