CodeReviewAssistant / models /code_analyzer.py
Deepanshu3012's picture
Added models, utils, data and their code
fc2d789
"""
Code Analyzer using CodeBERT and CodeT5.
- CodeBERT : embeddings + code quality classification
- CodeT5 : docstring / comment generation
"""
from __future__ import annotations

import logging
import re
from dataclasses import dataclass, field
from typing import Optional

import torch
from transformers import (
    AutoModel,
    AutoTokenizer,
    RobertaTokenizer,
    T5ForConditionalGeneration,
)
# ── Data classes ─────────────────────────────────────────────────────────
@dataclass
class CodeQualityResult:
    """Value object holding the outcome of a single code-quality analysis.

    The four score fields are required and positional; the remaining fields
    default to "nothing found / nothing generated".
    """

    # Aggregate score for the analysed snippet.
    overall_score: float
    # Score reflecting branching and nesting.
    complexity_score: float
    # Score reflecting docstring presence and comment density.
    documentation_score: float
    # Score reflecting identifier length.
    naming_score: float

    # Human-readable problems detected in the snippet (fresh list per instance).
    issues: list[str] = field(default_factory=list)
    # Human-readable improvement hints (fresh list per instance).
    suggestions: list[str] = field(default_factory=list)

    # Generated docstring text, or "" when none was produced.
    generated_docstring: str = ""
    # Embedding vector for the snippet, or None when not requested/available.
    embedding: Optional[list] = None
# ── Heuristic helpers ─────────────────────────────────────────────────────
def _has_docstrings(code: str) -> bool:
return '"""' in code or "'''" in code
def _count_comments(code: str) -> int:
return len([l for l in code.splitlines() if l.strip().startswith("#")])
def _avg_name_length(code: str) -> float:
names = re.findall(r'\b([a-zA-Z_]\w*)\s*(?:\(|=)', code)
meaningful = [n for n in names if n not in {
"if", "else", "for", "while", "def", "class",
"return", "import", "from", "True", "False", "None",
}]
if not meaningful:
return 5.0
return sum(len(n) for n in meaningful) / len(meaningful)
def _detect_issues(code: str) -> list[str]:
issues = []
lines = code.splitlines()
long = [i + 1 for i, l in enumerate(lines) if len(l) > 79]
if long:
issues.append(f"Lines exceeding PEP-8 limit (79 chars): {long[:5]}")
if re.search(r'(?<![.\w])\d{2,}(?![\w.])', code):
issues.append("Magic numbers detected β€” use named constants")
if re.search(r'except\s*:', code):
issues.append("Bare `except:` clause β€” catch specific exceptions")
if re.search(r'^global\s+\w+', code, re.MULTILINE):
issues.append("Use of `global` β€” consider refactoring")
defs = re.findall(r'def\s+\w+\(([^)]*)\)', code)
if [d for d in defs if d and ':' not in d]:
issues.append("Function parameters missing type hints")
if re.search(r'#\s*(TODO|FIXME|HACK)', code, re.IGNORECASE):
issues.append("TODO/FIXME comments found β€” resolve before production")
return issues
def _score_documentation(code: str) -> float:
    """Score how well *code* is documented, capped at 100.

    Starts from a base of 40, adds 40 when a triple-quoted string is present,
    and adds up to 20 more in proportion to the share of comment-only lines.
    """
    line_count = max(len(code.splitlines()), 1)  # avoid dividing by zero on ""
    density_bonus = min(_count_comments(code) / line_count * 200, 20.0)
    docstring_bonus = 40.0 if _has_docstrings(code) else 0.0
    return min(40.0 + docstring_bonus + density_bonus, 100.0)
def _score_naming(code: str) -> float:
    """Score identifier naming in *code* from the average identifier length.

    Bands: over 20 characters scores 60; 4 to 20 scores 85 plus 1.5 per
    character above 4 (bonus capped at 15); 2 up to 4 scores 55; under 2
    scores 30.
    """
    mean_length = _avg_name_length(code)
    if mean_length > 20:
        return 60.0
    if mean_length >= 4:
        bonus = min((mean_length - 4) * 1.5, 15.0)
        return 85.0 + bonus
    return 30.0 if mean_length < 2 else 55.0
def _score_complexity(code: str) -> float:
lines = code.splitlines()
branches = sum(
1 for l in lines
if re.search(r'\b(if|elif|for|while|try|except|with)\b', l)
)
nesting = max(
(len(l) - len(l.lstrip())) // 4 for l in lines if l.strip()
) if lines else 0
penalty = branches * 3 + nesting * 5
return max(100 - penalty, 10.0)
# ── Main analyzer class ───────────────────────────────────────────────────
class CodeReviewAnalyzer:
    """Code-quality analyzer combining heuristics with CodeBERT and CodeT5.

    Heuristic scores are always computed. The CodeBERT encoder (embeddings)
    and the CodeT5 summariser (docstring generation) are loaded on first use
    and then kept on the instance.
    """

    CODEBERT_MODEL = "microsoft/codebert-base"
    CODET5_MODEL = "Salesforce/codet5-base-codexglue-sum-python"

    def __init__(self, use_gpu: bool = False):
        """Pick the torch device; no model is loaded here.

        Args:
            use_gpu: Use CUDA when requested and available, otherwise CPU.
        """
        self.device = torch.device(
            "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        )
        self._bert_tokenizer = None
        self._bert_model = None
        self._t5_tokenizer = None
        self._t5_model = None

    def _load_codebert(self):
        """Load the CodeBERT tokenizer and model once, in eval mode."""
        if self._bert_model is not None:
            return
        print("Loading CodeBERT ...")
        tokenizer = AutoTokenizer.from_pretrained(self.CODEBERT_MODEL)
        model = AutoModel.from_pretrained(self.CODEBERT_MODEL)
        model.to(self.device).eval()
        # Publish only after every step succeeded, so a failed load is retried
        # on the next call instead of leaving a half-initialised model cached.
        self._bert_tokenizer = tokenizer
        self._bert_model = model

    def _load_codet5(self):
        """Load the CodeT5 tokenizer and model once, in eval mode."""
        if self._t5_model is not None:
            return
        print("Loading CodeT5 ...")
        tokenizer = RobertaTokenizer.from_pretrained(self.CODET5_MODEL)
        model = T5ForConditionalGeneration.from_pretrained(self.CODET5_MODEL)
        model.to(self.device).eval()
        # Same publish-after-success rule as for CodeBERT.
        self._t5_tokenizer = tokenizer
        self._t5_model = model

    def get_embedding(self, code: str) -> list[float]:
        """Return the mean of CodeBERT's last hidden state for *code*.

        Input is truncated to 512 tokens. The result is a flat list of floats.
        """
        self._load_codebert()
        encoded = self._bert_tokenizer(
            code,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        )
        encoded = {key: tensor.to(self.device) for key, tensor in encoded.items()}
        with torch.no_grad():
            output = self._bert_model(**encoded)
        # Average over the token axis, then drop the size-1 batch axis.
        pooled = output.last_hidden_state.mean(dim=1).squeeze(0)
        return pooled.tolist()

    def generate_docstring(self, code: str) -> str:
        """Generate a summary of *code* with CodeT5, wrapped in triple quotes.

        Input is truncated to 512 tokens; decoding uses 4-beam search and at
        most 128 new tokens.
        """
        self._load_codet5()
        inputs = self._t5_tokenizer(
            code,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(self.device)
        with torch.no_grad():
            outputs = self._t5_model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=4,
                early_stopping=True,
            )
        raw = self._t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return f'"""\n{raw.strip()}\n"""'

    def analyze(
        self,
        code: str,
        language: str = "python",
        generate_doc: bool = True,
        get_embedding: bool = False,
    ) -> CodeQualityResult:
        """Score *code* and optionally generate a docstring and an embedding.

        The overall score weights documentation 35%, naming 30% and
        complexity 35%. Model failures never propagate: a docstring failure
        is reported in ``generated_docstring``; an embedding failure is
        logged and leaves ``embedding`` as ``None``.

        Args:
            code: Source text to analyse.
            language: Accepted for API compatibility; currently not used.
            generate_doc: Whether to run docstring generation.
            get_embedding: Whether to compute an embedding.

        Returns:
            A ``CodeQualityResult`` with scores rounded to one decimal place.
        """
        issues = _detect_issues(code)
        doc_score = _score_documentation(code)
        name_score = _score_naming(code)
        comp_score = _score_complexity(code)
        overall = doc_score * 0.35 + name_score * 0.30 + comp_score * 0.35

        suggestions = []
        if doc_score < 60:
            suggestions.append("Add docstrings to all public functions")
        if name_score < 60:
            suggestions.append("Use descriptive variable names (4+ chars)")
        if comp_score < 50:
            suggestions.append("Reduce nesting — aim for complexity <= 10")
        suggestions.append("Run `black` for formatting and `flake8` for linting")

        docstring = ""
        if generate_doc:
            try:
                docstring = self.generate_docstring(code)
            except Exception as exc:
                # Best effort: surface the failure in the result instead of raising.
                docstring = f"# Could not generate: {exc}"

        embedding = None
        if get_embedding:
            try:
                embedding = self.get_embedding(code)
            except Exception as exc:
                # Still best effort, but leave a trace rather than failing silently.
                logging.getLogger(__name__).warning(
                    "Embedding generation failed: %s", exc
                )

        return CodeQualityResult(
            overall_score=round(overall, 1),
            complexity_score=round(comp_score, 1),
            documentation_score=round(doc_score, 1),
            naming_score=round(name_score, 1),
            issues=issues,
            suggestions=suggestions,
            generated_docstring=docstring,
            embedding=embedding,
        )