Spaces:
Running on Zero
Running on Zero
File size: 10,926 Bytes
5a0d646 ee5892f cfdaeb1 5a0d646 1dc7db9 cfdaeb1 1dc7db9 ee5892f 1dc7db9 cfdaeb1 2d5d362 ee5892f cfdaeb1 2d5d362 5a0d646 1dc7db9 5a0d646 cfdaeb1 5a0d646 49fb46a cfdaeb1 634b44d 5a0d646 cfdaeb1 5a0d646 cfdaeb1 2133d08 634b44d 2133d08 cfdaeb1 2133d08 cfdaeb1 5a0d646 cfdaeb1 5a0d646 cfdaeb1 5a0d646 1d719f3 cfdaeb1 5a0d646 1d719f3 5a0d646 cfdaeb1 5a0d646 1d719f3 cfdaeb1 5a0d646 cfdaeb1 ee5892f cfdaeb1 2d5d362 5a0d646 cfdaeb1 582d969 cfdaeb1 49fb46a b726029 cfdaeb1 2d5d362 cfdaeb1 1d719f3 cfdaeb1 5a0d646 1d719f3 5a0d646 ee5892f 5a0d646 ee5892f eadc788 ee5892f cfdaeb1 eadc788 5a0d646 cfdaeb1 ee5892f cfdaeb1 eadc788 cfdaeb1 eadc788 ee5892f eadc788 5a0d646 cfdaeb1 5a0d646 eadc788 cfdaeb1 eadc788 cfdaeb1 eadc788 5a0d646 eadc788 cfdaeb1 5a0d646 eadc788 ee5892f 5a0d646 cfdaeb1 5a0d646 cfdaeb1 ee5892f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 | """
CommitLens — gradio.Server mode
================================
- Serves custom index.html at GET /
- Exposes process_repo via @app.api() for the JS frontend to call
- Mellum 2 (4-bit NF4, CPU-resident between requests) handles per-file summaries via sequential GPU inference
- Groq llama-70b handles the final report (fast, no GPU cost)
- <think>...</think> blocks stripped from all Mellum outputs
- Per-file output is constrained to a short Summary / Reason / Observations layout (80-180 words)
"""
from __future__ import annotations
import logging
import os
import re
import sys
from pathlib import Path
import spaces
import torch
from fastapi.responses import HTMLResponse
from gradio import Server
from groq import Groq
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from commitlens import run_pipeline
# Route root logging to stdout with timestamped records.
logging.basicConfig(
    stream=sys.stdout,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
    level=logging.INFO,
)
# Module-wide logger used by every function in this file.
log = logging.getLogger("commitlens")
# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Hugging Face repo id passed to AutoTokenizer / AutoModelForCausalLM at startup.
MODEL_REPO_ID = "JetBrains/Mellum2-12B-A2.5B-Instruct"
# Model name passed to the Groq chat-completions call for the final report.
GROQ_MODEL = "llama-3.3-70b-versatile"  # fast Groq-hosted 70B
# Disabled: batching threshold from an earlier batched-inference path; nothing reads it now.
# BATCH_TOKEN_BUDGET = 7000  # estimated input tokens; above this -> sequential
# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------
# System prompt for the per-file review: fixed Summary / Reason / Observations
# layout with an 80-180 word target, used as the system turn in _build_mellum_prompt.
SUMMARY_SYSTEM_PROMPT = """
You are a senior software engineer reviewing a git diff for ONE file.
Analyze the actual code changes and produce a concise technical review.
Output EXACTLY in this format:
Summary:
<2-4 sentences describing the code changes>
Reason:
<1 sentence explaining the reason if clearly evident from the diff, otherwise "Reason not evident from the diff.">
Observations:
* <observation>
* <observation>
* <observation>
Rules:
* Use ONLY information visible in the diff and provided code context.
* Refer to functions, classes, methods, imports, decorators, constants, configuration values, API calls, and control flow when relevant.
* Focus on what was actually modified, added, removed, or refactored.
* Mention risks, assumptions, limitations, edge cases, or behavioral changes when visible.
* Mention architectural or design changes when directly supported by the diff.
* Do NOT invent requirements, business goals, performance improvements, bug fixes, security improvements, or developer intent.
* If something cannot be proven from the diff, do not claim it.
* Avoid generic statements such as:
"improves reliability"
"improves scalability"
"improves performance"
unless explicitly supported by the code changes.
* Do not repeat the filename.
* No markdown headers beyond the required section names.
* No code fences.
* No chain-of-thought.
* No speculative reasoning.
* Target 80-180 words.
"""
# System prompt for the final commit report; sent as the system message in
# _generate_final_report_groq. The last rule previously contained a mis-encoded
# character in place of the dash, which was being sent verbatim to the model.
FINAL_SYSTEM_PROMPT = """\
You are a technical writer producing a commit review report.
Given per-file summaries, write a structured markdown report with these exact sections:
## Commit Overview
One paragraph (3-5 sentences) summarising the overall intent of the commit.
## Changes Per File
A sub-section per file (### `filename`) with 2-4 bullet points.
## Key Takeaways
3-5 bullets: cross-cutting concerns, risks, follow-up actions.
Rules:
- Total report MUST be under 400 words
- No filler phrases ("In conclusion", "It is worth noting")
- Output markdown only — no preamble, no explanation
"""
# ---------------------------------------------------------------------------
# Global model state — CPU-resident between requests
# ---------------------------------------------------------------------------
# Both globals start as None and are assigned by load_model_on_startup().
# The inference helpers below read them without a None check.
_model: AutoModelForCausalLM | None = None
_tokenizer: AutoTokenizer | None = None
def _strip_thinking(text: str) -> str:
"""Remove <think>...</think> blocks (multiline) produced by thinking models."""
return re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
def _extract_filename(prompt: str) -> str:
for line in prompt.splitlines():
if line.startswith("Filename :"):
return line.split(":", 1)[1].strip()
return "unknown"
# ---------------------------------------------------------------------------
# Startup: load Mellum 2 in 4-bit NF4 into CPU RAM
# Runs ONCE before app.launch(), outside any @spaces.GPU context.
# ---------------------------------------------------------------------------
def load_model_on_startup() -> None:
    """Load the tokenizer and the 4-bit NF4 quantized model into the module globals.

    Populates ``_tokenizer`` and ``_model``. The model is requested with
    ``device_map="cpu"`` so that nothing is placed on the GPU here; this file's
    convention is that the GPU is only touched inside ``@spaces.GPU`` functions.

    The quantization is 4-bit (``load_in_4bit=True``, ``nf4``). Double
    quantization additionally quantizes the quantization constants; it lowers
    memory use and does not make the weights "6-bit".
    """
    global _model, _tokenizer

    log.info("=== STARTUP: loading tokenizer (%s) ===", MODEL_REPO_ID)
    _tokenizer = AutoTokenizer.from_pretrained(MODEL_REPO_ID)
    # generate() is given pad_token_id explicitly, so make sure one exists.
    if _tokenizer.pad_token_id is None:
        _tokenizer.pad_token_id = _tokenizer.eos_token_id
    log.info("Tokenizer ready. pad_token_id=%s", _tokenizer.pad_token_id)

    log.info("=== STARTUP: loading model in 4-bit NF4 on CPU ===")
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,  # also quantize the quantization constants
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    _model = AutoModelForCausalLM.from_pretrained(
        MODEL_REPO_ID,
        quantization_config=quant_cfg,
        device_map="cpu",
        torch_dtype=torch.bfloat16,
    )
    _model.eval()
    log.info("=== STARTUP: model ready on CPU ===")
# ---------------------------------------------------------------------------
# Mellum inference (called inside @spaces.GPU)
# ---------------------------------------------------------------------------
def _build_mellum_prompt(user_content: str) -> str:
    """Render the review system prompt plus one user message as templated text.

    Args:
        user_content: The per-file prompt body placed in the user turn.

    Returns:
        The conversation formatted by the tokenizer's chat template, as a
        string (not token ids), with the generation prompt appended.
    """
    conversation = [
        {"role": "system", "content": SUMMARY_SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    return _tokenizer.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=False,
    )
def _generate_sequential(prompts: list[str], *, max_new_tokens: int = 200) -> list[str]:
    """Generate one completion per prompt, one prompt at a time, on the GPU.

    This is the only inference path used by ``process_repo``. It expects
    ``_model`` to already be on CUDA.

    Args:
        prompts: Chat-templated prompt strings (output of
            ``_build_mellum_prompt``).
        max_new_tokens: Upper bound on generated tokens per prompt. Any
            ``<think>`` content the model emits counts against this budget.

    Returns:
        Decoded completions in the same order as ``prompts``, with thinking
        blocks removed by ``_strip_thinking``.
    """
    total = len(prompts)
    log.info("Sequential inference: %d prompts", total)
    _tokenizer.padding_side = "right"

    # Loop-invariant sampling settings, built once.
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "use_cache": True,
        "do_sample": True,
        "temperature": 0.4,
        "top_p": 0.95,
        "pad_token_id": _tokenizer.pad_token_id,
    }

    results: list[str] = []
    for i, prompt in enumerate(prompts, start=1):
        log.info(" [%d/%d]", i, total)
        # The prompt already went through apply_chat_template, which inserts
        # the special tokens itself; adding them again would duplicate them.
        enc = _tokenizer(
            prompt,
            return_tensors="pt",
            add_special_tokens=False,
        ).to("cuda")
        prompt_len = enc.input_ids.shape[1]
        with torch.no_grad():
            out = _model.generate(**enc, **gen_kwargs)
        # Decode only the newly generated tail, not the echoed prompt.
        text = _tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        results.append(_strip_thinking(text))
    return results
# ---------------------------------------------------------------------------
# Groq final report (pure API call — no GPU needed)
# ---------------------------------------------------------------------------
def _generate_final_report_groq(per_file_summaries: list[dict]) -> str:
    """Ask the Groq-hosted model for the final markdown commit report.

    Args:
        per_file_summaries: Items with ``"name"`` and ``"summary"`` keys; each
            becomes a ``### `name` `` section in the user message.

    Returns:
        The stripped report text, or ``""`` when the API returns no content.

    Raises:
        KeyError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

    # One markdown section per file, separated by blank lines.
    user_content = "\n\n".join(
        f"### `{f['name']}`\n{f['summary']}" for f in per_file_summaries
    )

    log.info("Calling Groq %s for final report (%d files) ...", GROQ_MODEL, len(per_file_summaries))
    response = groq_client.chat.completions.create(
        model=GROQ_MODEL,
        messages=[
            {"role": "system", "content": FINAL_SYSTEM_PROMPT},
            {"role": "user", "content": user_content},
        ],
        max_tokens=600,  # 400-word cap + small buffer
        temperature=0.2,  # low temp for consistent, factual output
    )

    # message.content is optional in the response schema; guard before .strip().
    content = response.choices[0].message.content
    if not content:
        log.warning("Groq returned an empty report")
        return ""

    report = content.strip()
    log.info("Groq report received (%d chars)", len(report))
    return report
# ---------------------------------------------------------------------------
# gradio.Server app
# ---------------------------------------------------------------------------
# Application object; the route and API endpoint below register themselves on it
# through its decorators, and it is started with app.launch() at the bottom of the file.
app = Server()
@app.get("/", response_class=HTMLResponse)
async def homepage():
    """Serve the index.html that sits next to this module as the landing page."""
    index_file = Path(__file__).with_name("index.html")
    page = index_file.read_text(encoding="utf-8")
    return HTMLResponse(content=page)
@app.api(name="process_repo")
@spaces.GPU(duration=240)
def process_repo(repo_url: str, token: str) -> dict:
    """Run the full review pipeline for a repository.

    Steps:
      1. ``run_pipeline()`` -> per-file prompts (the existing comments state
         these are capped at the top 2 most changed files by commitlens).
      2. Mellum 2, sequential GPU inference -> per-file summaries.
      3. Groq -> final markdown report.

    Args:
        repo_url: Repository URL forwarded to ``run_pipeline``.
        token: Optional access token. Blank, whitespace-only, or missing
            values are forwarded as ``None``.

    Returns:
        ``{"files": [{"name": str, "summary": str}, ...], "report": str}``

    Raises:
        ValueError: If ``run_pipeline`` yields no prompts.
    """
    log.info("=== process_repo: %s ===", repo_url)

    # Step 1 - fetch diff and build prompts.
    # The frontend may send null for an empty token field; treat it as blank.
    auth_token = (token or "").strip() or None
    prompts = run_pipeline(repo_url, auth_token)
    log.info("Got %d file prompts from pipeline (capped at top 2)", len(prompts))
    if not prompts:
        raise ValueError("No matching source-code files changed in the latest commit.")

    fnames = [_extract_filename(p) for p in prompts]
    mellum_prompts = [_build_mellum_prompt(p) for p in prompts]

    # Step 2 - sequential Mellum 2 inference on the GPU.
    # The model is moved only once there is work to do, so the early
    # ValueError above no longer pays for a device transfer.
    _model.to("cuda")
    summaries = _generate_sequential(mellum_prompts)
    file_results = [
        {"name": n, "summary": s}
        for n, s in zip(fnames, summaries)
    ]
    log.info("Sequential per-file summaries done")

    # Step 3 - send the per-file summaries to Groq for the final report.
    # NOTE(review): this call and run_pipeline above both execute inside the
    # @spaces.GPU window, so the GPU allocation is held during network I/O.
    final_report = _generate_final_report_groq(file_results)
    log.info("Pipeline complete β processed top %d files", len(file_results))
    return {"files": file_results, "report": final_report}
# ---------------------------------------------------------------------------
# Boot
# ---------------------------------------------------------------------------
# Runs at import time, before the server is launched, so the globals used by
# the request handlers are populated before any request can arrive.
load_model_on_startup()  # weights land in CPU RAM; GPU untouched until first request

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    log.info("Starting CommitLens ...")
    app.launch()
|