Deepanshu3012 committed on
Commit
fc2d789
·
1 Parent(s): 43a8680

Added models, utils, data and their code

Browse files
data/__init__.py ADDED
File without changes
models/__init__.py ADDED
File without changes
models/code_analyzer.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Code Analyzer using CodeBERT and CodeT5.
3
+ - CodeBERT : embeddings + code quality classification
4
+ - CodeT5 : docstring / comment generation
5
+ """
6
+
7
+ from __future__ import annotations
8
+ import re
9
+ from dataclasses import dataclass, field
10
+ from typing import Optional
11
+
12
+ import torch
13
+ from transformers import (
14
+ AutoTokenizer,
15
+ AutoModel,
16
+ RobertaTokenizer,
17
+ T5ForConditionalGeneration,
18
+ )
19
+
20
+
21
+ # ── Data classes ─────────────────────────────────────────────────────────
22
+
23
@dataclass
class CodeQualityResult:
    """Container for the outcome of a single code-quality analysis.

    The four ``*_score`` fields are required, in the order declared below;
    every other field falls back to an empty default.
    """

    # Required numeric scores.
    overall_score: float
    complexity_score: float
    documentation_score: float
    naming_score: float

    # Human-readable findings; each instance gets its own fresh list.
    issues: list[str] = field(default_factory=list)
    suggestions: list[str] = field(default_factory=list)

    # Generated docstring text; empty string when none is supplied.
    generated_docstring: str = ""

    # Embedding values, or None when none are supplied.
    embedding: Optional[list] = None
33
+
34
+
35
+ # ── Heuristic helpers ─────────────────────────────────────────────────────
36
+
37
def _has_docstrings(code: str) -> bool:
    """Report whether *code* contains a triple-quote delimiter.

    This is a purely textual check: any run of three double quotes or three
    single quotes counts, whether or not it actually opens a docstring.
    """
    delimiters = ('"' * 3, "'" * 3)
    return any(delimiter in code for delimiter in delimiters)
39
+
40
def _count_comments(code: str) -> int:
    """Count the lines of *code* whose first non-blank character is ``#``.

    Trailing comments that follow code on the same line are not counted.
    """
    return sum(
        1
        for line in code.splitlines()
        if line.lstrip().startswith("#")
    )
42
+
43
def _avg_name_length(code: str) -> float:
    """Return the mean length of identifiers that are called or assigned.

    An identifier is picked up when it is directly followed (optionally
    after whitespace) by ``(`` or ``=``. Python keywords are ignored so that
    constructs such as ``a and (b)`` or ``elif (x)`` do not count ``and`` /
    ``elif`` as names.

    Args:
        code: Source text to scan.

    Returns:
        The average identifier length, or the neutral value ``5.0`` when no
        identifier is found.
    """
    # Function-scope import keeps this helper self-contained.
    import keyword

    candidates = re.findall(r'\b([a-zA-Z_]\w*)\s*(?:\(|=)', code)
    # The full keyword list is a superset of the small hand-written set the
    # heuristic used to rely on (which missed elif/and/or/not/in/with/...).
    names = [name for name in candidates if not keyword.iskeyword(name)]
    if not names:
        return 5.0
    return sum(map(len, names)) / len(names)
52
+
53
def _detect_issues(code: str) -> list[str]:
    """Run regex-based lint heuristics over *code*.

    Checks, in order: lines longer than 79 characters, numeric literals of
    two or more digits, bare ``except:`` clauses, ``global`` statements,
    function signatures whose parameter list contains no ``:`` at all, and
    ``TODO`` / ``FIXME`` / ``HACK`` comments.

    Args:
        code: Source text to inspect.

    Returns:
        One human-readable message per triggered check, in the order above;
        an empty list when nothing is flagged.
    """
    issues: list[str] = []

    long_lines = [
        lineno
        for lineno, line in enumerate(code.splitlines(), start=1)
        if len(line) > 79
    ]
    if long_lines:
        # Only the first five offending line numbers are reported.
        issues.append(
            f"Lines exceeding PEP-8 limit (79 chars): {long_lines[:5]}"
        )

    if re.search(r'(?<![.\w])\d{2,}(?![\w.])', code):
        issues.append("Magic numbers detected — use named constants")

    if re.search(r'except\s*:', code):
        issues.append("Bare `except:` clause — catch specific exceptions")

    # `global` statements sit inside function bodies, so leading indentation
    # must be allowed; anchoring at column 0 would never match real usage.
    if re.search(r'^[ \t]*global\s+\w+', code, re.MULTILINE):
        issues.append("Use of `global` — consider refactoring")

    signatures = re.findall(r'def\s+\w+\(([^)]*)\)', code)
    if any(params and ':' not in params for params in signatures):
        issues.append("Function parameters missing type hints")

    if re.search(r'#\s*(TODO|FIXME|HACK)', code, re.IGNORECASE):
        issues.append(
            "TODO/FIXME comments found — resolve before production"
        )

    return issues
78
+
79
def _score_documentation(code: str) -> float:
    """Score the documentation of *code*, capped at 100.

    The score starts from a base of 40, gains 40 when ``_has_docstrings``
    reports a triple-quoted string, and gains up to 20 more in proportion
    to the share of comment-only lines (200 points per unit of density).
    """
    base = 40.0
    docstring_bonus = 40.0 if _has_docstrings(code) else 0.0
    # Guard against division by zero for empty input.
    line_count = max(len(code.splitlines()), 1)
    comment_bonus = min(_count_comments(code) / line_count * 200, 20.0)
    return min(base + docstring_bonus + comment_bonus, 100.0)
86
+
87
def _score_naming(code: str) -> float:
    """Map the average identifier length of *code* onto a naming score.

    Bands, based on the value returned by ``_avg_name_length``:
    below 2 scores 30, below 4 scores 55, up to and including 20 scores 85
    plus 1.5 per character above 4 (bonus capped at 15), and anything
    longer scores 60.
    """
    mean_length = _avg_name_length(code)
    if mean_length < 2:
        score = 30.0
    elif mean_length < 4:
        score = 55.0
    elif mean_length <= 20:
        bonus = min((mean_length - 4) * 1.5, 15.0)
        score = 85.0 + bonus
    else:
        score = 60.0
    return score
93
+
94
def _score_complexity(code: str) -> float:
    """Score *code* from 100 downward based on branching and nesting.

    Each non-blank line containing a branching keyword (``if``, ``elif``,
    ``for``, ``while``, ``try``, ``except``, ``with``) costs 3 points, and
    each level of the deepest indentation (counted in units of four leading
    whitespace characters) costs 5 points.

    Args:
        code: Source text to score.

    Returns:
        A float between 10.0 and 100.0; empty or whitespace-only input
        scores 100.0.
    """
    branch_pattern = r'\b(if|elif|for|while|try|except|with)\b'
    content_lines = [line for line in code.splitlines() if line.strip()]

    branches = sum(
        1 for line in content_lines if re.search(branch_pattern, line)
    )
    # `default=0` covers input that has lines but no non-blank ones, where a
    # plain max() over an empty iterable would raise ValueError.
    nesting = max(
        ((len(line) - len(line.lstrip())) // 4 for line in content_lines),
        default=0,
    )

    penalty = branches * 3 + nesting * 5
    # Always hand back a float, matching the annotated return type.
    return float(max(100 - penalty, 10))
105
+
106
+
107
+ # ── Main analyzer class ───────────────────────────────────────────────────
108
+
109
class CodeReviewAnalyzer:
    """Heuristic code-quality scorer with optional CodeBERT / CodeT5 output.

    The heuristic scores need no model. CodeBERT (embeddings) and CodeT5
    (docstring generation) are loaded lazily, the first time a method that
    needs them is called, and are then reused.
    """

    CODEBERT_MODEL = "microsoft/codebert-base"
    CODET5_MODEL = "Salesforce/codet5-base-codexglue-sum-python"

    def __init__(self, use_gpu: bool = False):
        """Select the torch device; no model is loaded here.

        Args:
            use_gpu: Use CUDA when requested *and* available, else CPU.
        """
        self.device = torch.device(
            "cuda" if use_gpu and torch.cuda.is_available() else "cpu"
        )
        self._bert_tokenizer = None
        self._bert_model = None
        self._t5_tokenizer = None
        self._t5_model = None

    def _load_codebert(self):
        """Load the CodeBERT tokenizer and model once, in eval mode."""
        if self._bert_model is None:
            print("Loading CodeBERT ...")
            self._bert_tokenizer = AutoTokenizer.from_pretrained(
                self.CODEBERT_MODEL
            )
            self._bert_model = AutoModel.from_pretrained(self.CODEBERT_MODEL)
            self._bert_model.to(self.device).eval()

    def _load_codet5(self):
        """Load the CodeT5 tokenizer and model once, in eval mode."""
        if self._t5_model is None:
            print("Loading CodeT5 ...")
            self._t5_tokenizer = RobertaTokenizer.from_pretrained(
                self.CODET5_MODEL
            )
            self._t5_model = T5ForConditionalGeneration.from_pretrained(
                self.CODET5_MODEL
            )
            self._t5_model.to(self.device).eval()

    def get_embedding(self, code: str) -> list[float]:
        """Return a CodeBERT embedding of *code* as a plain list.

        The input is truncated to 512 tokens and the last hidden state is
        averaged over the token dimension.
        """
        self._load_codebert()
        tokens = self._bert_tokenizer(
            code,
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        )
        tokens = {key: value.to(self.device) for key, value in tokens.items()}
        with torch.no_grad():
            out = self._bert_model(**tokens)
        return out.last_hidden_state.mean(dim=1).squeeze().tolist()

    def generate_docstring(self, code: str) -> str:
        """Generate a summary of *code* with CodeT5, wrapped in triple quotes.

        The input is truncated to 512 tokens; generation uses beam search
        with 4 beams and at most 128 new tokens.
        """
        self._load_codet5()
        inputs = self._t5_tokenizer(
            code,
            return_tensors="pt",
            max_length=512,
            truncation=True,
        ).to(self.device)
        with torch.no_grad():
            outputs = self._t5_model.generate(
                **inputs,
                max_new_tokens=128,
                num_beams=4,
                early_stopping=True,
            )
        raw = self._t5_tokenizer.decode(outputs[0], skip_special_tokens=True)
        return f'"""\n{raw.strip()}\n"""'

    def analyze(
        self,
        code: str,
        language: str = "python",
        generate_doc: bool = True,
        get_embedding: bool = False,
    ) -> CodeQualityResult:
        """Score *code* and optionally attach a docstring and an embedding.

        Args:
            code: Source text to analyse.
            language: Accepted for interface compatibility; the heuristics
                below do not read it.
            generate_doc: When true, try to generate a docstring with
                CodeT5. On failure the field holds a
                ``# Could not generate: ...`` note instead.
            get_embedding: When true, try to compute a CodeBERT embedding.
                On failure the field stays ``None`` and a warning is logged.

        Returns:
            A ``CodeQualityResult`` with scores rounded to one decimal.
        """
        issues = _detect_issues(code)
        doc_score = _score_documentation(code)
        name_score = _score_naming(code)
        comp_score = _score_complexity(code)
        # Weighted blend: documentation 35 %, naming 30 %, complexity 35 %.
        overall = doc_score * 0.35 + name_score * 0.30 + comp_score * 0.35

        suggestions = []
        if doc_score < 60:
            suggestions.append("Add docstrings to all public functions")
        if name_score < 60:
            suggestions.append("Use descriptive variable names (4+ chars)")
        if comp_score < 50:
            suggestions.append("Reduce nesting — aim for complexity <= 10")
        suggestions.append(
            "Run `black` for formatting and `flake8` for linting"
        )

        docstring = ""
        if generate_doc:
            try:
                docstring = self.generate_docstring(code)
            except Exception as exc:
                # Best effort: surface the failure in the result itself.
                docstring = f"# Could not generate: {exc}"

        embedding = None
        if get_embedding:
            try:
                embedding = self.get_embedding(code)
            except Exception as exc:
                # Still best effort, but leave a trace instead of silently
                # discarding the error.
                import logging

                logging.getLogger(__name__).warning(
                    "Embedding generation failed: %s", exc
                )

        return CodeQualityResult(
            overall_score=round(overall, 1),
            complexity_score=round(comp_score, 1),
            documentation_score=round(doc_score, 1),
            naming_score=round(name_score, 1),
            issues=issues,
            suggestions=suggestions,
            generated_docstring=docstring,
            embedding=embedding,
        )
utils/__init__.py ADDED
File without changes
utils/helpers.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility helpers for the Code Review NLP Assistant.
3
+ """
4
+
5
+ from __future__ import annotations
6
+ import ast
7
+ import re
8
+ from typing import Any
9
+
10
+
11
+ # ── Code parsing helpers ──────────────────────────────────────────────────
12
+
13
def extract_functions(code: str) -> list[dict[str, Any]]:
    """Parse Python source and return metadata for each function/method.

    Both ``def`` and ``async def`` definitions are reported, at any nesting
    depth, ordered by the line they start on.

    Args:
        code: Python source text.

    Returns:
        A list of dicts with keys ``name``, ``args`` (names of the regular
        positional-or-keyword parameters only), ``returns`` (the return
        annotation as source text, ``""`` when absent, ``"?"`` when it
        cannot be rendered), ``has_docstring``, ``lineno``, ``end_lineno``
        and ``source`` (the lines from ``lineno`` through ``end_lineno``).
        An empty list is returned when the source cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError: some Python versions reject NUL bytes this way rather
        # than with SyntaxError.
        return []

    lines = code.splitlines()
    results: list[dict[str, Any]] = []

    for node in ast.walk(tree):
        if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            continue

        returns = ""
        if node.returns is not None:
            try:
                returns = ast.unparse(node.returns)
            except Exception:
                returns = "?"

        # Fall back to the first line when no end position is recorded.
        end = getattr(node, "end_lineno", None) or node.lineno

        results.append({
            "name": node.name,
            "args": [arg.arg for arg in node.args.args],
            "returns": returns,
            "has_docstring": ast.get_docstring(node, clean=False) is not None,
            "lineno": node.lineno,
            "end_lineno": end,
            "source": "\n".join(lines[node.lineno - 1:end]),
        })

    # ast.walk is breadth-first, so methods would otherwise come after
    # later top-level functions; report in source order instead.
    results.sort(key=lambda item: item["lineno"])
    return results
64
+
65
+
66
def extract_classes(code: str) -> list[dict[str, Any]]:
    """Return metadata for every class defined in Python source.

    Classes at any nesting depth are reported, ordered by the line they
    start on.

    Args:
        code: Python source text.

    Returns:
        A list of dicts with keys ``name``, ``methods`` (names of the
        ``def`` / ``async def`` statements directly in the class body),
        ``has_docstring`` and ``lineno``. An empty list is returned when the
        source cannot be parsed.
    """
    try:
        tree = ast.parse(code)
    except (SyntaxError, ValueError):
        # ValueError: some Python versions reject NUL bytes this way rather
        # than with SyntaxError.
        return []

    results: list[dict[str, Any]] = []
    for node in ast.walk(tree):
        if not isinstance(node, ast.ClassDef):
            continue

        # Only direct children: walking the subtree would also pick up
        # helper functions nested in methods and methods of nested classes.
        methods = [
            stmt.name
            for stmt in node.body
            if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]

        results.append({
            "name": node.name,
            "methods": methods,
            # get_docstring only accepts a leading *string* constant, so a
            # body such as `...` or `42` is not mistaken for documentation.
            "has_docstring": ast.get_docstring(node, clean=False) is not None,
            "lineno": node.lineno,
        })

    # ast.walk is breadth-first; report classes in source order instead.
    results.sort(key=lambda item: item["lineno"])
    return results
94
+
95
+
96
def detect_language(code: str) -> str:
    """Guess the language of *code* from a few keyword patterns.

    Probes are tried in order and the first match wins: a ``def`` line
    ending in a colon means "python"; ``function`` / ``const`` / ``let`` /
    ``var`` means "javascript"; ``public ... class`` on one line means
    "java". Anything else is "unknown".
    """
    # (pattern, regex flags, language label)
    probes = (
        (r'\bdef\b.*:\s*$', re.MULTILINE, "python"),
        (r'\bfunction\b|\bconst\b|\blet\b|\bvar\b', 0, "javascript"),
        (r'\bpublic\b.*\bclass\b', 0, "java"),
    )
    for pattern, flags, language in probes:
        if re.search(pattern, code, flags):
            return language
    return "unknown"
105
+
106
+
107
+ # ── Reporting helpers ─────────────────────────────────────────────────────
108
+
109
# Grade bands as (minimum score, letter, label), highest threshold first.
GRADE_MAP = [
    (90, "A", "Excellent"),
    (75, "B", "Good"),
    (60, "C", "Needs work"),
    (40, "D", "Poor"),
    (0, "F", "Critical"),
]


def score_to_grade(score: float) -> tuple[str, str]:
    """Translate a 0-100 score into a ``(letter_grade, label)`` pair.

    The first band in ``GRADE_MAP`` whose threshold the score reaches is
    used; a score below every threshold yields ``("F", "Critical")``.
    """
    matching_bands = (
        (letter, label)
        for minimum, letter, label in GRADE_MAP
        if score >= minimum
    )
    return next(matching_bands, ("F", "Critical"))
124
+
125
+
126
def score_color(score: float) -> str:
    """Pick the hex colour for *score*.

    Bands are checked from the highest threshold down (80, 60, 40); scores
    below 40 get the final fallback colour.
    """
    bands = (
        (80, "#22c55e"),
        (60, "#f59e0b"),
        (40, "#f97316"),
    )
    for minimum, colour in bands:
        if score >= minimum:
            return colour
    return "#ef4444"
132
+
133
+
134
def build_report(result: Any, filename: str = "code_review") -> str:
    """Generate a Markdown report from a CodeQualityResult.

    Args:
        result: Object exposing ``overall_score``, ``documentation_score``,
            ``naming_score``, ``complexity_score``, ``issues``,
            ``suggestions`` and ``generated_docstring``.
        filename: Name shown in the report title.

    Returns:
        The report as one newline-joined string. The issues, suggestions
        and generated-docstring sections are included only when the
        corresponding attribute is non-empty.
    """
    grade, label = score_to_grade(result.overall_score)
    lines = [
        f"# Code Review Report — `{filename}`\n",
        f"## Overall Score: {result.overall_score}/100 ({grade} — {label})\n",
        "### Sub-scores\n",
        "| Metric | Score |",
        "|--------|-------|",
        f"| Documentation | {result.documentation_score}/100 |",
        f"| Naming Quality | {result.naming_score}/100 |",
        f"| Complexity | {result.complexity_score}/100 |",
        "",
    ]

    if result.issues:
        lines.append("### Issues Found\n")
        lines.extend(f"- {issue}" for issue in result.issues)
        lines.append("")

    if result.suggestions:
        lines.append("### Suggestions\n")
        lines.extend(f"- {suggestion}" for suggestion in result.suggestions)
        lines.append("")

    if result.generated_docstring:
        lines.append("### Generated Docstring (CodeT5)\n")
        lines.append("```python")
        lines.append(result.generated_docstring)
        lines.append("```\n")

    lines.append("---")
    lines.append(
        "*Generated by Code Review NLP Assistant using CodeBERT + CodeT5*"
    )

    return "\n".join(lines)