File size: 10,926 Bytes
5a0d646
ee5892f
 
 
 
cfdaeb1
 
 
 
5a0d646
 
 
 
1dc7db9
cfdaeb1
 
1dc7db9
ee5892f
1dc7db9
cfdaeb1
2d5d362
ee5892f
 
cfdaeb1
2d5d362
5a0d646
 
 
1dc7db9
 
 
 
 
 
 
5a0d646
cfdaeb1
5a0d646
 
49fb46a
cfdaeb1
634b44d
5a0d646
cfdaeb1
 
 
5a0d646
cfdaeb1
2133d08
 
 
 
 
 
 
 
 
634b44d
 
2133d08
 
 
 
 
 
 
cfdaeb1
 
2133d08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cfdaeb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a0d646
 
cfdaeb1
5a0d646
 
cfdaeb1
 
5a0d646
1d719f3
cfdaeb1
 
 
5a0d646
1d719f3
5a0d646
 
 
cfdaeb1
5a0d646
 
1d719f3
cfdaeb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5a0d646
cfdaeb1
 
 
 
 
ee5892f
cfdaeb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d5d362
5a0d646
cfdaeb1
 
 
 
 
 
 
 
 
 
 
582d969
cfdaeb1
49fb46a
b726029
 
cfdaeb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2d5d362
cfdaeb1
 
 
 
 
 
 
 
 
 
1d719f3
cfdaeb1
 
 
5a0d646
1d719f3
5a0d646
ee5892f
5a0d646
 
ee5892f
 
 
 
 
 
 
 
 
 
eadc788
ee5892f
 
cfdaeb1
eadc788
 
 
5a0d646
cfdaeb1
ee5892f
cfdaeb1
eadc788
 
 
cfdaeb1
eadc788
ee5892f
eadc788
5a0d646
cfdaeb1
5a0d646
eadc788
cfdaeb1
eadc788
cfdaeb1
 
 
 
 
eadc788
5a0d646
eadc788
cfdaeb1
5a0d646
eadc788
ee5892f
5a0d646
cfdaeb1
 
 
 
 
 
5a0d646
cfdaeb1
ee5892f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
"""
CommitLens — gradio.Server mode
================================
- Serves custom index.html at GET /
- Exposes process_repo via @app.api() for the JS frontend to call
- Mellum 2 (4-bit NF4, loaded into CPU RAM at startup) handles per-file summaries via sequential GPU inference
- Groq llama-70b handles the final report (fast, no GPU cost)
- <think>...</think> blocks stripped from all Mellum outputs
- Per-file output follows a fixed Summary / Reason / Observations layout (target 80-180 words)
"""

from __future__ import annotations

import logging
import os
import re
import sys
from pathlib import Path

import spaces
import torch
from fastapi.responses import HTMLResponse
from gradio import Server
from groq import Groq
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

from commitlens import run_pipeline

# Send INFO-and-above records to stdout as "time [LEVEL] logger: message".
# `log` is the logger every function in this module writes to.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
log = logging.getLogger("commitlens")

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------

# Hugging Face Hub repo id handed to AutoTokenizer / AutoModelForCausalLM.from_pretrained().
MODEL_REPO_ID   = "JetBrains/Mellum2-12B-A2.5B-Instruct"
# Model name sent to the Groq chat-completions API for the final report.
GROQ_MODEL      = "llama-3.3-70b-versatile"   # fast Groq-hosted 70B
# Disabled setting: per its own note, inputs above this estimated token count
# were meant to be processed sequentially rather than as a batch.
# BATCH_TOKEN_BUDGET = 7000   # estimated input tokens; above this → sequential

# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------

# System prompt for the per-file Mellum review (used by _build_mellum_prompt).
# Tight, fixed-layout prompt → short output → fewer tokens generated.
# NOTE: the literal intentionally starts with a newline and ends with two
# spaces; it is sent to the model exactly as written.
SUMMARY_SYSTEM_PROMPT = """
You are a senior software engineer reviewing a git diff for ONE file.

Analyze the actual code changes and produce a concise technical review.

Output EXACTLY in this format:

Summary:
<2-4 sentences describing the code changes>

Reason:
<1 sentence explaining the reason if clearly evident from the diff, otherwise "Reason not evident from the diff.">

Observations:

* <observation>
* <observation>
* <observation>

Rules:

* Use ONLY information visible in the diff and provided code context.
* Refer to functions, classes, methods, imports, decorators, constants, configuration values, API calls, and control flow when relevant.
* Focus on what was actually modified, added, removed, or refactored.
* Mention risks, assumptions, limitations, edge cases, or behavioral changes when visible.
* Mention architectural or design changes when directly supported by the diff.
* Do NOT invent requirements, business goals, performance improvements, bug fixes, security improvements, or developer intent.
* If something cannot be proven from the diff, do not claim it.
* Avoid generic statements such as:
  "improves reliability"
  "improves scalability"
  "improves performance"
  unless explicitly supported by the code changes.
* Do not repeat the filename.
* No markdown headers beyond the required section names.
* No code fences.
* No chain-of-thought.
* No speculative reasoning.
* Target 80-180 words.
  """


# System prompt for the Groq call that merges the per-file summaries into a
# single markdown report (used by _generate_final_report_groq).
FINAL_SYSTEM_PROMPT = """\
You are a technical writer producing a commit review report.

Given per-file summaries, write a structured markdown report with these exact sections:

## Commit Overview
One paragraph (3-5 sentences) summarising the overall intent of the commit.

## Changes Per File
A sub-section per file (### `filename`) with 2-4 bullet points.

## Key Takeaways
3-5 bullets: cross-cutting concerns, risks, follow-up actions.

Rules:
- Total report MUST be under 400 words
- No filler phrases ("In conclusion", "It is worth noting")
- Output markdown only — no preamble, no explanation
"""

# ---------------------------------------------------------------------------
# Global model state — CPU-resident between requests
# ---------------------------------------------------------------------------

# Both globals stay None until load_model_on_startup() assigns them.
_model:     AutoModelForCausalLM | None = None   # causal LM used for per-file summaries
_tokenizer: AutoTokenizer        | None = None   # tokenizer loaded from MODEL_REPO_ID


def _strip_thinking(text: str) -> str:
    """
    Remove model reasoning wrapped in <think> tags and trim the result.

    Handles two shapes of output:
      * complete ``<think>...</think>`` blocks (possibly multiline, possibly
        several of them), which are deleted;
      * a trailing ``<think>`` that is never closed, which happens when
        generation stops at the token limit in the middle of the reasoning.
        Everything from that tag to the end of the text is deleted.

    Returns the remaining text with surrounding whitespace stripped; the
    result may be empty if the text contained only reasoning.
    """
    # Closed blocks first: non-greedy so separate blocks are removed one by one.
    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL)
    # Any <think> still present has no closing tag; drop it and what follows
    # so partial chain-of-thought never reaches the caller.
    cleaned = re.sub(r"<think>.*", "", cleaned, flags=re.DOTALL)
    return cleaned.strip()


def _extract_filename(prompt: str) -> str:
    """
    Return the file name declared in a pipeline prompt.

    Scans the prompt line by line for the first line that begins with
    ``Filename :`` and returns the text after the first colon, stripped of
    surrounding whitespace. Returns ``"unknown"`` when no such line exists.
    """
    marker = "Filename :"
    candidates = (
        row.partition(":")[2].strip()
        for row in prompt.splitlines()
        if row.startswith(marker)
    )
    # Only the first matching line counts; later ones are never evaluated.
    return next(candidates, "unknown")


# ---------------------------------------------------------------------------
# Startup: load Mellum 2 in 4-bit NF4 into CPU RAM
# Runs ONCE before app.launch(), outside any @spaces.GPU context.
# ---------------------------------------------------------------------------

def load_model_on_startup() -> None:
    """
    Load the Mellum 2 tokenizer and model into CPU RAM.

    The results are stored in the module globals ``_tokenizer`` and
    ``_model``. The weights are quantized with bitsandbytes 4-bit NF4 plus
    double (nested) quantization, with bfloat16 as the compute dtype.
    ``device_map="cpu"`` keeps the weights off the GPU at load time; the
    model is moved to CUDA later, inside the ``@spaces.GPU``-decorated
    request handler.

    If the tokenizer defines no pad token, the EOS token id is used as the
    pad token id so that generation has a valid ``pad_token_id``.
    """
    global _model, _tokenizer

    log.info("=== STARTUP: loading tokenizer (%s) ===", MODEL_REPO_ID)
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO_ID)
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token_id = _tokenizer.eos_token_id
    log.info("Tokenizer ready. pad_token_id=%s", _tokenizer.pad_token_id)

    # The quantization below is 4-bit NF4; the message states that explicitly.
    log.info("=== STARTUP: loading model in 4-bit NF4 on CPU ===")
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        # Nested quantization: the quantization constants are quantized too,
        # which lowers memory further. Weights remain 4-bit NF4.
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO_ID,
        quantization_config=quant_cfg,
        device_map="cpu",
        torch_dtype=torch.bfloat16,
    )
    _model.eval()  # inference only: disables dropout and similar train-time behavior
    log.info("=== STARTUP: model ready on CPU ===")


# ---------------------------------------------------------------------------
# Mellum inference (called inside @spaces.GPU)
# ---------------------------------------------------------------------------

def _build_mellum_prompt(user_content: str) -> str:
    """
    Render a two-message conversation through the tokenizer's chat template.

    The conversation is SUMMARY_SYSTEM_PROMPT as the system message followed
    by ``user_content`` as the user message. The template is returned as
    text (not token ids) with the generation prompt appended, ready to be
    tokenized for generation.
    """
    conversation = [
        {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
        {"role": "user",   "content": user_content},
    ]
    rendered = _tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
    return rendered

def _generate_sequential(prompts: list[str]) -> list[str]:
    """
    Generate one completion per prompt, processing the prompts one at a time.

    Each prompt is tokenized on its own, moved to CUDA, and sampled for at
    most 200 new tokens (temperature 0.4, top-p 0.95). Only the newly
    generated tokens are decoded, and <think> blocks are removed with
    _strip_thinking().

    Returns the cleaned completions in the same order as ``prompts``.
    """
    total = len(prompts)
    log.info("Sequential inference: %d prompts", total)
    _tokenizer.padding_side = "right"

    completions: list[str] = []
    for position, prompt in enumerate(prompts, start=1):
        log.info("  [%d/%d]", position, total)
        encoded = _tokenizer(prompt, return_tensors="pt").to("cuda")
        # Number of prompt tokens; generate() returns prompt + continuation.
        prompt_len = encoded.input_ids.shape[1]
        with torch.no_grad():
            generated = _model.generate(
                **encoded,
                max_new_tokens=200,
                use_cache=True,
                do_sample=True,
                temperature=0.4,
                top_p=0.95,
                pad_token_id=_tokenizer.pad_token_id,
            )
        # Slice off the echoed prompt so only the continuation is decoded.
        new_tokens = generated[0][prompt_len:]
        raw_text = _tokenizer.decode(new_tokens, skip_special_tokens=True)
        completions.append(_strip_thinking(raw_text))
    return completions

# ---------------------------------------------------------------------------
# Groq final report (pure API call — no GPU needed)
# ---------------------------------------------------------------------------

def _generate_final_report_groq(per_file_summaries: list[dict]) -> str:
    """
    Ask the Groq-hosted model for the final markdown commit report.

    Args:
        per_file_summaries: dicts with a ``"name"`` and a ``"summary"`` key,
            one per reviewed file.

    Returns:
        The report text with surrounding whitespace stripped. An empty
        string is returned (and a warning logged) if the API response
        carries no message content.

    Raises:
        KeyError: if the ``GROQ_API_KEY`` environment variable is not set.
    """
    groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # One "### `file`" section per summary, separated by blank lines.
    user_content = "\n\n".join(
        f"### `{f['name']}`\n{f['summary']}"
        for f in per_file_summaries
    )

    log.info("Calling Groq %s for final report (%d files) ...", GROQ_MODEL, len(per_file_summaries))
    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {"role": "system", "content": FINAL_SYSTEM_PROMPT},
            {"role": "user",   "content": user_content},
        ],
        max_tokens=600,       # 400-word cap + small buffer
        temperature=0.2,      # low temp for consistent, factual output
    )

    # message.content is optional in the chat-completions schema; calling
    # .strip() on None would raise AttributeError after all the GPU work.
    content = response.choices[0].message.content
    if content is None:
        log.warning("Groq response contained no message content; returning empty report")
        content = ""
    report = content.strip()
    log.info("Groq report received (%d chars)", len(report))
    return report


# ---------------------------------------------------------------------------
# gradio.Server app
# ---------------------------------------------------------------------------

# Application object; the route and API endpoint below are registered on it via decorators.
app = Server()


@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the index.html located next to this module as the landing page."""
    module_dir = Path(__file__).parent
    page_source = module_dir.joinpath("index.html").read_text(encoding="utf-8")
    return HTMLResponse(content=page_source)


@app.api(name="process_repo")
@spaces.GPU(duration=240)
def process_repo(repo_url: str, token: str) -> dict:
    """
    Run the full CommitLens pipeline for a repository.

    Steps:
      1. run_pipeline()         → one prompt per selected changed file
      2. Mellum 2 (sequential)  → one summary per prompt, generated on the GPU
      3. Groq                   → final markdown report built from the summaries

    Args:
        repo_url: Repository URL, passed to run_pipeline() unchanged.
        token: Access token. A blank, whitespace-only, or missing (None)
            value is passed to run_pipeline() as None.

    Returns:
        ``{"files": [{"name": str, "summary": str}, ...], "report": str}``

    Raises:
        ValueError: if run_pipeline() returns no prompts.
    """
    log.info("=== process_repo: %s ===", repo_url)

    # Step 1 — fetch the diff and build prompts. `token or ""` guards against
    # a client sending null instead of an empty string.
    access_token = (token or "").strip() or None
    prompts = run_pipeline(repo_url, access_token)
    log.info("Got %d file prompts from pipeline (capped at top 2)", len(prompts))
    if not prompts:
        raise ValueError("No matching source-code files changed in the latest commit.")

    fnames = [_extract_filename(p) for p in prompts]

    # Step 2 — move the model to the GPU only once there is work for it, then
    # run the prompts through Mellum 2 one at a time.
    _model.to("cuda")
    mellum_prompts = [_build_mellum_prompt(p) for p in prompts]
    summaries = _generate_sequential(mellum_prompts)

    file_results = [
        {"name": n, "summary": s}
        for n, s in zip(fnames, summaries)
    ]
    log.info("Sequential per-file summaries done")

    # Step 3 — send the per-file summaries to Groq for the final report.
    final_report = _generate_final_report_groq(file_results)

    log.info("Pipeline complete — processed top %d files", len(file_results))
    return {"files": file_results, "report": final_report}

# ---------------------------------------------------------------------------
# Boot
# ---------------------------------------------------------------------------

# Runs at import time (not only under __main__), so the model is loaded
# whenever this module is imported and before the server is launched.
load_model_on_startup()   # weights land in CPU RAM; GPU untouched until first request

if __name__ == "__main__":
    # Direct execution: start serving the app.
    log.info("Starting CommitLens ...")
    app.launch()