Spaces:

Deepanshu3012
/

CodeReviewAssistant

Sleeping

App Files Files Community

Deepanshu3012 committed 17 days ago

Commit

fc2d789

1 Parent(s): 43a8680

Added models, utils, and data packages and their code

Browse files

Files changed (5) hide show

data/__init__.py +0 -0
models/__init__.py +0 -0
models/code_analyzer.py +212 -0
utils/__init__.py +0 -0
utils/helpers.py +170 -0

data/__init__.py ADDED Viewed

File without changes

models/__init__.py ADDED Viewed

File without changes

models/code_analyzer.py ADDED Viewed

	@@ -0,0 +1,212 @@

+"""
+Code Analyzer using CodeBERT and CodeT5.
+- CodeBERT  : embeddings + code quality classification
+- CodeT5    : docstring / comment generation
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from typing import Optional
+import torch
+from transformers import (
+    AutoTokenizer,
+    AutoModel,
+    RobertaTokenizer,
+    T5ForConditionalGeneration,
+)
+# ── Data classes ─────────────────────────────────────────────────────────
@dataclass
class CodeQualityResult:
    """Outcome of one code-quality analysis run.

    The four scores are floats; the analyzer in this module produces them on
    a 0-100 scale. The remaining fields are optional extras and default to
    empty values.
    """

    # Weighted combination of the three sub-scores below.
    overall_score: float
    # Penalises branching keywords and deep indentation.
    complexity_score: float
    # Rewards triple-quoted strings and comment density.
    documentation_score: float
    # Derived from the average length of identifiers.
    naming_score: float
    # Human-readable findings; a fresh list per instance.
    issues: list[str] = field(default_factory=list)
    # Human-readable recommendations; a fresh list per instance.
    suggestions: list[str] = field(default_factory=list)
    # Text produced by the docstring generator, or "" when not requested.
    generated_docstring: str = ""
    # Embedding vector as a plain list, or None when not requested.
    embedding: Optional[list] = None
+# ── Heuristic helpers ─────────────────────────────────────────────────────
+def _has_docstrings(code: str) -> bool:
+    return '"""' in code or "'''" in code
+def _count_comments(code: str) -> int:
+    return len([l for l in code.splitlines() if l.strip().startswith("#")])
+def _avg_name_length(code: str) -> float:
+    names = re.findall(r'\b([a-zA-Z_]\w*)\s*(?:\(|=)', code)
+    meaningful = [n for n in names if n not in {
+        "if", "else", "for", "while", "def", "class",
+        "return", "import", "from", "True", "False", "None",
+    }]
+    if not meaningful:
+        return 5.0
+    return sum(len(n) for n in meaningful) / len(meaningful)
+def _detect_issues(code: str) -> list[str]:
+    issues = []
+    lines = code.splitlines()
+    long = [i + 1 for i, l in enumerate(lines) if len(l) > 79]
+    if long:
+        issues.append(f"Lines exceeding PEP-8 limit (79 chars): {long[:5]}")
+    if re.search(r'(?<![.\w])\d{2,}(?![\w.])', code):
+        issues.append("Magic numbers detected — use named constants")
+    if re.search(r'except\s*:', code):
+        issues.append("Bare `except:` clause — catch specific exceptions")
+    if re.search(r'^global\s+\w+', code, re.MULTILINE):
+        issues.append("Use of `global` — consider refactoring")
+    defs = re.findall(r'def\s+\w+\(([^)]*)\)', code)
+    if [d for d in defs if d and ':' not in d]:
+        issues.append("Function parameters missing type hints")
+    if re.search(r'#\s*(TODO|FIXME|HACK)', code, re.IGNORECASE):
+        issues.append("TODO/FIXME comments found — resolve before production")
+    return issues
def _score_documentation(code: str) -> float:
    """Score how well *code* is documented, from 40 to 100.

    Every input starts at 40 points. Containing a triple-quoted string adds
    40, and comment density (comment lines / total lines, scaled by 200) adds
    up to 20 more.
    """
    # Guard against division by zero on empty input.
    line_count = max(len(code.splitlines()), 1)
    density_bonus = min(_count_comments(code) / line_count * 200, 20.0)
    docstring_bonus = 40.0 if _has_docstrings(code) else 0.0
    return min(40.0 + docstring_bonus + density_bonus, 100.0)
def _score_naming(code: str) -> float:
    """Score identifier naming from the average identifier length.

    Bands: below 2 characters -> 30, below 4 -> 55, 4 to 20 -> 85 plus 1.5
    per character above 4 (capped at 100), above 20 -> 60.
    """
    avg_length = _avg_name_length(code)
    if avg_length > 20:
        # Extremely long names are treated as a readability problem too.
        return 60.0
    if avg_length >= 4:
        return 85.0 + min((avg_length - 4) * 1.5, 15.0)
    return 55.0 if avg_length >= 2 else 30.0
+def _score_complexity(code: str) -> float:
+    lines = code.splitlines()
+    branches = sum(
+        1 for l in lines
+        if re.search(r'\b(if|elif|for|while|try|except|with)\b', l)
+    )
+    nesting = max(
+        (len(l) - len(l.lstrip())) // 4 for l in lines if l.strip()
+    ) if lines else 0
+    penalty = branches * 3 + nesting * 5
+    return max(100 - penalty, 10.0)
+# ── Main analyzer class ───────────────────────────────────────────────────
class CodeReviewAnalyzer:
    """Score source code with heuristics, optionally backed by CodeBERT/CodeT5.

    The heuristic scores need no model. CodeBERT (embeddings) and CodeT5
    (docstring generation) are loaded lazily, the first time the matching
    method is called, and are then kept on the instance.
    """

    CODEBERT_MODEL = "microsoft/codebert-base"
    CODET5_MODEL = "Salesforce/codet5-base-codexglue-sum-python"

    def __init__(self, use_gpu: bool = False):
        """Select the torch device; no model is loaded here.

        Args:
            use_gpu: Use CUDA when it is available; otherwise fall back to CPU.
        """
        self.device = torch.device(
            "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        )
        self._bert_tokenizer = None
        self._bert_model = None
        self._t5_tokenizer = None
        self._t5_model = None

    def _load_codebert(self):
        """Load the CodeBERT tokenizer and model once, in eval mode."""
        if self._bert_model is None:
            print("Loading CodeBERT ...")
            self._bert_tokenizer = AutoTokenizer.from_pretrained(self.CODEBERT_MODEL)
            self._bert_model = AutoModel.from_pretrained(self.CODEBERT_MODEL)
            self._bert_model.to(self.device).eval()

    def _load_codet5(self):
        """Load the CodeT5 tokenizer and model once, in eval mode."""
        if self._t5_model is None:
            print("Loading CodeT5 ...")
            self._t5_tokenizer = RobertaTokenizer.from_pretrained(self.CODET5_MODEL)
            self._t5_model = T5ForConditionalGeneration.from_pretrained(
                self.CODET5_MODEL
            )
            self._t5_model.to(self.device).eval()

    def get_embedding(self, code: str) -> list[float]:
        """Return a CodeBERT embedding for *code*.

        The input is truncated to 512 tokens and the last hidden states are
        averaged over the token dimension.

        Args:
            code: Source text to embed.

        Returns:
            The pooled vector converted to a plain Python list.
        """
        self._load_codebert()
        tokens = self._bert_tokenizer(
            code,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        )
        tokens = {k: v.to(self.device) for k, v in tokens.items()}
        with torch.no_grad():
            out = self._bert_model(**tokens)
        return out.last_hidden_state.mean(dim=1).squeeze().tolist()

    def generate_docstring(self, code: str) -> str:
        """Generate a summary of *code* with CodeT5.

        The input is truncated to 512 tokens; generation uses beam search
        with 4 beams and at most 128 new tokens.

        Args:
            code: Source text to summarise.

        Returns:
            The generated text wrapped in triple double quotes, each
            delimiter on its own line.
        """
        self._load_codet5()
        inputs = self._t5_tokenizer(
            code,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(self.device)
        with torch.no_grad():
            outputs = self._t5_model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=4,
                early_stopping=True,
            )
        raw = self._t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return f'"""\n{raw.strip()}\n"""'

    def analyze(
        self,
        code: str,
        language: str = "python",
        generate_doc: bool = True,
        get_embedding: bool = False,
    ) -> CodeQualityResult:
        """Run the full analysis and bundle everything into one result.

        The overall score weights documentation at 0.35, naming at 0.30 and
        complexity at 0.35. Model-backed steps are best-effort: a failure
        there never aborts the analysis.

        Args:
            code: Source text to analyse.
            language: Accepted for interface compatibility; not used by the
                current implementation.
            generate_doc: Also generate a docstring with CodeT5.
            get_embedding: Also compute a CodeBERT embedding.

        Returns:
            A ``CodeQualityResult`` with scores rounded to one decimal place.
            If docstring generation fails, ``generated_docstring`` holds a
            ``# Could not generate: ...`` note; if embedding fails,
            ``embedding`` stays ``None`` and the error is printed.
        """
        issues = _detect_issues(code)
        doc_score = _score_documentation(code)
        name_score = _score_naming(code)
        comp_score = _score_complexity(code)
        overall = doc_score * 0.35 + name_score * 0.30 + comp_score * 0.35

        suggestions = []
        if doc_score < 60:
            suggestions.append("Add docstrings to all public functions")
        if name_score < 60:
            suggestions.append("Use descriptive variable names (4+ chars)")
        if comp_score < 50:
            suggestions.append("Reduce nesting — aim for complexity <= 10")
        # The formatting/linting hint is always included.
        suggestions.append("Run `black` for formatting and `flake8` for linting")

        docstring = ""
        if generate_doc:
            try:
                docstring = self.generate_docstring(code)
            except Exception as exc:
                # Surface the failure inside the report instead of raising.
                docstring = f"# Could not generate: {exc}"

        embedding = None
        if get_embedding:
            try:
                embedding = self.get_embedding(code)
            except Exception as exc:
                # Embeddings are optional, so stay best-effort, but report
                # the failure instead of discarding it silently.
                print(f"Could not compute embedding: {exc}")

        return CodeQualityResult(
            overall_score=round(overall, 1),
            complexity_score=round(comp_score, 1),
            documentation_score=round(doc_score, 1),
            naming_score=round(name_score, 1),
            issues=issues,
            suggestions=suggestions,
            generated_docstring=docstring,
            embedding=embedding,
        )

utils/__init__.py ADDED Viewed

File without changes

utils/helpers.py ADDED Viewed

	@@ -0,0 +1,170 @@

+"""
+Utility helpers for the Code Review NLP Assistant.
+"""
+from __future__ import annotations
+import ast
+import re
+from typing import Any
+# ── Code parsing helpers ──────────────────────────────────────────────────
def extract_functions(code: str) -> list[dict[str, Any]]:
    """
    Parse Python source and return metadata for each function/method.

    Functions are found anywhere in the tree (module level, inside classes,
    nested in other functions) and reported in ``ast.walk`` order.

    Args:
        code: Python source text.

    Returns:
        A list of dicts with keys:
            name, args, returns, has_docstring, lineno, end_lineno, source
        ``args`` lists every parameter name in signature order:
        positional-only, regular, ``*args``, keyword-only, ``**kwargs``.
        ``returns`` is the return annotation as text, ``""`` when absent or
        ``"?"`` when it cannot be rendered. The list is empty when the source
        cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError: older interpreters reject null bytes this way.
        return []

    source_lines = code.splitlines()
    results: list[dict[str, Any]] = []
    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        # Collect all parameter kinds, not only plain positional ones.
        params = node.args
        args = [a.arg for a in getattr(params, "posonlyargs", [])]
        args.extend(a.arg for a in params.args)
        if params.vararg:
            args.append(params.vararg.arg)
        args.extend(a.arg for a in params.kwonlyargs)
        if params.kwarg:
            args.append(params.kwarg.arg)

        # Return annotation
        returns = ""
        if node.returns:
            try:
                returns = ast.unparse(node.returns)
            except Exception:
                # e.g. ast.unparse is unavailable on this interpreter
                returns = "?"

        # Fall back to the first line when end positions are unavailable.
        end = getattr(node, "end_lineno", node.lineno)
        results.append({
            "name":          node.name,
            "args":          args,
            "returns":       returns,
            "has_docstring": ast.get_docstring(node, clean=False) is not None,
            "lineno":        node.lineno,
            "end_lineno":    end,
            "source":        "\n".join(source_lines[node.lineno - 1 : end]),
        })
    return results
def extract_classes(code: str) -> list[dict[str, Any]]:
    """Return a list of class metadata dicts.

    Args:
        code: Python source text.

    Returns:
        One dict per class (nested classes included) with keys ``name``,
        ``methods``, ``has_docstring`` and ``lineno``. ``methods`` holds only
        the functions defined directly in the class body, so helpers nested
        inside a method and methods of nested classes are not attributed to
        the outer class. The list is empty when the source cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError: older interpreters reject null bytes this way.
        return []

    results: list[dict[str, Any]] = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.ClassDef):
            continue
        methods = [
            child.name
            for child in node.body
            if isinstance(child, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]
        results.append({
            "name":          node.name,
            "methods":       methods,
            # Only a leading *string* expression counts as a docstring.
            "has_docstring": ast.get_docstring(node, clean=False) is not None,
            "lineno":        node.lineno,
        })
    return results
def detect_language(code: str) -> str:
    """Guess the language of *code* from a few keyword patterns.

    The probes run in a fixed order -- python, javascript, java -- and the
    first one that matches wins. ``"unknown"`` is returned when none match.
    """
    probes = (
        ("python", r'\bdef\b.*:\s*$', re.MULTILINE),
        ("javascript", r'\bfunction\b|\bconst\b|\blet\b|\bvar\b', 0),
        ("java", r'\bpublic\b.*\bclass\b', 0),
    )
    for language, pattern, flags in probes:
        if re.search(pattern, code, flags):
            return language
    return "unknown"
+# ── Reporting helpers ─────────────────────────────────────────────────────
# (minimum score, letter grade, label), ordered from best to worst so the
# first satisfied threshold is the right grade.
GRADE_MAP = [
    (90, "A", "Excellent"),
    (75, "B", "Good"),
    (60, "C", "Needs work"),
    (40, "D", "Poor"),
    (0,  "F", "Critical"),
]


def score_to_grade(score: float) -> tuple[str, str]:
    """Map a 0-100 score to its ``(letter_grade, label)`` pair.

    Scores that meet no threshold in ``GRADE_MAP`` (for example negative
    values) are graded ``("F", "Critical")``.
    """
    matching = (
        (letter, label)
        for threshold, letter, label in GRADE_MAP
        if score >= threshold
    )
    return next(matching, ("F", "Critical"))
def score_color(score: float) -> str:
    """Pick the hex colour for *score*.

    Bands: 80 and above, 60 and above, 40 and above, and everything below,
    each with its own colour code.
    """
    bands = (
        (80, "#22c55e"),
        (60, "#f59e0b"),
        (40, "#f97316"),
    )
    for floor, colour in bands:
        if score >= floor:
            return colour
    return "#ef4444"
def build_report(result: Any, filename: str = "code_review") -> str:
    """Render *result* as a Markdown report.

    Args:
        result: Object exposing ``overall_score``, ``documentation_score``,
            ``naming_score``, ``complexity_score``, ``issues``,
            ``suggestions`` and ``generated_docstring``.
        filename: Name shown in the report title.

    Returns:
        The report as one newline-joined string. The issues, suggestions and
        docstring sections appear only when the matching attribute is truthy.
    """
    grade, label = score_to_grade(result.overall_score)

    def bullet_section(title: str, entries: Any) -> list:
        # Heading, one bullet per entry, then a blank separator line.
        if not entries:
            return []
        return [title, *(f"- {entry}" for entry in entries), ""]

    report = [
        f"# Code Review Report — `{filename}`\n",
        f"## Overall Score: {result.overall_score}/100 ({grade} — {label})\n",
        "### Sub-scores\n",
        "| Metric | Score |",
        "|--------|-------|",
        f"| Documentation | {result.documentation_score}/100 |",
        f"| Naming Quality | {result.naming_score}/100 |",
        f"| Complexity | {result.complexity_score}/100 |",
        "",
    ]
    report += bullet_section("### Issues Found\n", result.issues)
    report += bullet_section("### Suggestions\n", result.suggestions)

    if result.generated_docstring:
        report += [
            "### Generated Docstring (CodeT5)\n",
            "```python",
            result.generated_docstring,
            "```\n",
        ]

    report += [
        "---",
        "*Generated by Code Review NLP Assistant using CodeBERT + CodeT5*",
    ]
    return "\n".join(report)