"""
CommitLens — gradio.Server mode
================================
- Serves custom index.html at GET /
- Exposes process_repo via @app.api() for the JS frontend to call
- Mellum 2 (4-bit NF4, loaded into CPU RAM at startup) handles per-file summaries via sequential GPU inference
- Groq llama-70b handles the final report (fast, no GPU cost)
- <think>...</think> blocks stripped from all Mellum outputs
- Per-file output follows a fixed Summary / Reason / Observations layout (target 80-180 words)
"""

from __future__ import annotations

import logging
import os
import re
import sys
from pathlib import Path

import spaces
import torch
from fastapi.responses import HTMLResponse
from gradio import Server
from groq import Groq
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from commitlens import run_pipeline

# Root logging: INFO and above, written to stdout with a timestamped format.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)

# Logger shared by every function in this module.
log = logging.getLogger("commitlens")

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# Hugging Face repo id passed to AutoTokenizer / AutoModelForCausalLM at startup.
MODEL_REPO_ID   = "JetBrains/Mellum2-12B-A2.5B-Instruct"
# Model name sent to the Groq chat-completions API for the final report.
GROQ_MODEL      = "llama-3.3-70b-versatile"   # fast Groq-hosted 70B
# NOTE: disabled — nothing in this module reads a batch budget; inference is sequential.
# BATCH_TOKEN_BUDGET = 7000   # estimated input tokens; above this → sequential

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------

# System prompt for the per-file review (used as the "system" turn in
# _build_mellum_prompt). It fixes the output to Summary / Reason / Observations
# sections to keep generations short.
# NOTE(review): the literal starts with a newline and ends with a
# whitespace-only line; both are sent to the model as-is.
# NOTE(review): the 80-180 word target may not fit in the 200-token generation
# cap used by _generate_sequential — confirm the two limits agree.
SUMMARY_SYSTEM_PROMPT = """
You are a senior software engineer reviewing a git diff for ONE file.

Analyze the actual code changes and produce a concise technical review.

Output EXACTLY in this format:

Summary:
<2-4 sentences describing the code changes>

Reason:
<1 sentence explaining the reason if clearly evident from the diff, otherwise "Reason not evident from the diff.">

Observations:

* <observation>
* <observation>
* <observation>

Rules:

* Use ONLY information visible in the diff and provided code context.
* Refer to functions, classes, methods, imports, decorators, constants, configuration values, API calls, and control flow when relevant.
* Focus on what was actually modified, added, removed, or refactored.
* Mention risks, assumptions, limitations, edge cases, or behavioral changes when visible.
* Mention architectural or design changes when directly supported by the diff.
* Do NOT invent requirements, business goals, performance improvements, bug fixes, security improvements, or developer intent.
* If something cannot be proven from the diff, do not claim it.
* Avoid generic statements such as:
  "improves reliability"
  "improves scalability"
  "improves performance"
  unless explicitly supported by the code changes.
* Do not repeat the filename.
* No markdown headers beyond the required section names.
* No code fences.
* No chain-of-thought.
* No speculative reasoning.
* Target 80-180 words.
  """


# System prompt for the final report (used as the "system" turn of the Groq
# request in _generate_final_report_groq). The user turn carries the per-file
# summaries; the model is asked for a three-section markdown report.
FINAL_SYSTEM_PROMPT = """\
You are a technical writer producing a commit review report.

Given per-file summaries, write a structured markdown report with these exact sections:

## Commit Overview
One paragraph (3-5 sentences) summarising the overall intent of the commit.

## Changes Per File
A sub-section per file (### `filename`) with 2-4 bullet points.

## Key Takeaways
3-5 bullets: cross-cutting concerns, risks, follow-up actions.

Rules:
- Total report MUST be under 400 words
- No filler phrases ("In conclusion", "It is worth noting")
- Output markdown only — no preamble, no explanation
"""

# ---------------------------------------------------------------------------
# Global model state — CPU-resident between requests
# ---------------------------------------------------------------------------

# Both stay None until load_model_on_startup() assigns them; the inference
# helpers below read them as module globals.
_model:     AutoModelForCausalLM | None = None
_tokenizer: AutoTokenizer        | None = None


def _strip_thinking(text: str) -> str:
    """Remove <think>...</think> blocks (multiline) produced by thinking models."""
    return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()


def _extract_filename(prompt: str) -> str:
    for line in prompt.splitlines():
        if line.startswith("Filename :"):
            return line.split(":", 1)[1].strip()
    return "unknown"


# ---------------------------------------------------------------------------
# Startup: load Mellum 2 in 4-bit NF4 into CPU RAM
# Runs ONCE before app.launch(), outside any @spaces.GPU context.
# ---------------------------------------------------------------------------

def load_model_on_startup() -> None:
    """Load the tokenizer and the 4-bit NF4 quantized model into CPU RAM.

    Populates the module globals ``_tokenizer`` and ``_model``.

    The quantization config is 4-bit NF4 with nested ("double") quantization
    and bfloat16 compute. Nested quantization compresses the quantization
    constants as well, so the footprint is slightly *below* plain 4-bit; it
    does not make the weights 6-bit.

    ``device_map="cpu"`` keeps the weights off the GPU at load time. The
    intent is that GPU allocation happens only inside ``@spaces.GPU``
    functions.

    If the tokenizer has no pad token, the EOS token id is reused so that
    ``generate()`` always receives a valid ``pad_token_id``.
    """
    global _model, _tokenizer

    log.info("=== STARTUP: loading tokenizer (%s) ===", MODEL_REPO_ID)
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO_ID)
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token_id = _tokenizer.eos_token_id
    log.info("Tokenizer ready. pad_token_id=%s", _tokenizer.pad_token_id)

    log.info("=== STARTUP: loading model in 4-bit NF4 on CPU ===")
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,   # also quantize the quantization constants
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO_ID,
        quantization_config=quant_cfg,
        device_map="cpu",
        torch_dtype=torch.bfloat16,
    )
    _model.eval()   # inference only
    log.info("=== STARTUP: model ready on CPU ===")


# ---------------------------------------------------------------------------
# Mellum inference (called inside @spaces.GPU)
# ---------------------------------------------------------------------------

def _build_mellum_prompt(user_content: str) -> str:
    """Render a system + user conversation through the tokenizer's chat template.

    The system turn is ``SUMMARY_SYSTEM_PROMPT`` and the user turn is
    *user_content*. The template is applied with ``tokenize=False`` and
    ``add_generation_prompt=True``, so the result is prompt text ready for the
    model to continue.
    """
    conversation = [
        {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return _tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )

def _generate_sequential(prompts: list[str], *, max_new_tokens: int = 200) -> list[str]:
    """Generate one completion per prompt, one prompt at a time, on the GPU.

    Expects the module-level model to already be on CUDA; inputs are moved to
    ``"cuda"`` here.

    Args:
        prompts: Prompt strings already rendered through the chat template
            (see ``_build_mellum_prompt``).
        max_new_tokens: Upper bound on generated tokens per prompt.

    Returns:
        Decoded completions in the same order as *prompts*, with the prompt
        tokens removed and thinking blocks stripped via ``_strip_thinking``.
        An empty *prompts* list yields an empty list.
    """
    total = len(prompts)
    log.info("Sequential inference: %d prompts", total)
    _tokenizer.padding_side = "right"
    results = []
    for position, prompt in enumerate(prompts, start=1):
        log.info("  [%d/%d]", position, total)
        # The chat template already inserted the special tokens it needs;
        # letting the tokenizer add them again could duplicate them.
        enc = _tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to("cuda")
        prompt_len = enc.input_ids.shape[1]
        with torch.no_grad():
            out = _model.generate(
                **enc,
                max_new_tokens=max_new_tokens,
                use_cache=True,
                do_sample=True,
                temperature=0.4,
                top_p=0.95,
                pad_token_id=_tokenizer.pad_token_id,
            )
        # generate() returns prompt + completion; keep only the completion.
        text = _tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        results.append(_strip_thinking(text))
    return results

# ---------------------------------------------------------------------------
# Groq final report (pure API call — no GPU needed)
# ---------------------------------------------------------------------------

def _generate_final_report_groq(per_file_summaries: list[dict]) -> str:
    """Ask the Groq-hosted model for the final markdown commit report.

    Args:
        per_file_summaries: Dicts with ``"name"`` and ``"summary"`` keys, one
            per file. Each becomes a ``### `name``` section in the user
            message.

    Returns:
        The report text with surrounding whitespace stripped. An empty string
        is returned (and a warning logged) if the API response carries no
        message content.

    Raises:
        KeyError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # One markdown sub-section per file, separated by blank lines.
    user_content = "\n\n".join(
        f"### `{f['name']}`\n{f['summary']}"
        for f in per_file_summaries
    )

    log.info("Calling Groq %s for final report (%d files) ...", GROQ_MODEL, len(per_file_summaries))
    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {"role": "system", "content": FINAL_SYSTEM_PROMPT},
            {"role": "user",   "content": user_content},
        ],
        max_tokens=600,       # 400-word cap + small buffer
        temperature=0.2,      # low temp for consistent, factual output
    )

    # The message content is optional in the response schema; guard against
    # None so the failure mode is an empty report, not an AttributeError.
    content = response.choices[0].message.content
    if content is None:
        log.warning("Groq returned no message content; using an empty report")
        content = ""

    report = content.strip()
    log.info("Groq report received (%d chars)", len(report))
    return report


# ---------------------------------------------------------------------------
# gradio.Server app
# ---------------------------------------------------------------------------

# Single server instance; the @app.get / @app.api decorators below register
# their routes on it, and app.launch() starts it when run as a script.
app = Server()


@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the ``index.html`` file that sits next to this module at GET /."""
    index_file = Path(__file__).with_name("index.html")
    markup = index_file.read_text(encoding="utf-8")
    return HTMLResponse(content=markup)


@app.api(name="process_repo")
@spaces.GPU(duration=240)
def process_repo(repo_url: str, token: str) -> dict:
    """Run the full review pipeline for the latest commit of a repository.

    Steps:
      1. ``run_pipeline()`` builds one prompt per selected changed file. The
         log message below says the list is capped at the top 2 files; that
         limit lives in ``commitlens``, not here.
      2. The local model summarizes each prompt sequentially on the GPU.
      3. The Groq-hosted model turns the summaries into the final report.

    Args:
        repo_url: Repository URL forwarded to ``run_pipeline``.
        token: Access token. Blank, whitespace-only or missing values are
            forwarded as ``None``.

    Returns:
        ``{"files": [{"name": str, "summary": str}, ...], "report": str}``

    Raises:
        ValueError: If ``run_pipeline`` returns no prompts.
    """
    log.info("=== process_repo: %s ===", repo_url)

    # Step 1 — fetch diff and build prompts.
    # `token or ""` tolerates a null token from the frontend.
    access_token = (token or "").strip() or None
    prompts = run_pipeline(repo_url, access_token)
    log.info("Got %d file prompts from pipeline (capped at top 2)", len(prompts))
    if not prompts:
        raise ValueError("No matching source-code files changed in the latest commit.")

    fnames = [_extract_filename(p) for p in prompts]

    # Step 2 — sequential per-file summaries on the GPU. The weights are moved
    # only once there is something to summarize, so the error path above does
    # not pay for the transfer.
    _model.to("cuda")
    mellum_prompts = [_build_mellum_prompt(p) for p in prompts]
    summaries = _generate_sequential(mellum_prompts)

    file_results = [
        {"name": n, "summary": s}
        for n, s in zip(fnames, summaries)
    ]
    log.info("Sequential per-file summaries done")

    # Step 3 — send the per-file summaries to Groq for the final report.
    final_report = _generate_final_report_groq(file_results)

    log.info("Pipeline complete — processed top %d files", len(file_results))
    return {"files": file_results, "report": final_report}

# ---------------------------------------------------------------------------
# Boot
# ---------------------------------------------------------------------------

# Runs at import time (it is outside the __main__ guard), so the model is
# loaded whenever this module is imported, before app.launch() is reached.
load_model_on_startup()   # weights land in CPU RAM; GPU untouched until first request

# Start the server only when executed as a script.
if __name__ == "__main__":
    log.info("Starting CommitLens ...")
    app.launch()