download
raw
11.1 kB
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""Technical debt indicator detection.

Scans source files for common code smells that indicate technical debt:
TODO/FIXME-style comment markers, oversized files, and overly long functions.
"""
import pathlib
import re
from dataclasses import dataclass
from typing import Dict, List, Optional
from .file_discovery import get_tracked_source_files
@dataclass
class DebtMarker:
    """One debt comment marker (TODO, FIXME, ...) located in a source file."""

    file: str  # file the marker was found in
    line: int  # line number of the marker
    marker_type: str  # TODO, FIXME, HACK, XXX, etc.
    text: str  # comment text that follows the marker keyword

    def to_dict(self) -> dict:
        """Return a plain-dict view with the text capped at 200 characters."""
        # Cap the text so very long comments do not bloat the report.
        shortened = self.text[:200]
        return dict(
            file=self.file,
            line=self.line,
            type=self.marker_type,
            text=shortened,
        )
@dataclass
class LargeFile:
    """Line counts for a file that exceeds size thresholds."""

    path: str  # path of the oversized file
    total_lines: int  # number of lines in the file
    code_lines: int  # number of lines counted as code

    def to_dict(self) -> dict:
        """Return the three fields as a plain dictionary."""
        field_names = ("path", "total_lines", "code_lines")
        return {name: getattr(self, name) for name in field_names}
@dataclass
class LongFunction:
    """Location and size of a function that exceeds length thresholds."""

    file: str  # file containing the function
    function_name: str  # name of the function
    line: int  # line number where the function starts
    length: int  # number of lines attributed to the function

    def to_dict(self) -> dict:
        """Return a plain-dict view; the name is exposed under ``"function"``."""
        summary = {"file": self.file}
        summary["function"] = self.function_name
        summary["line"] = self.line
        summary["length"] = self.length
        return summary
# Debt marker patterns as (regex, file type) pairs. In every regex, group 1
# captures the marker keyword and group 2 the text that follows it. The three
# entries differ only in the comment opener and in how the text is terminated.
DEBT_PATTERNS = [
    (
        opener
        + r"\s*(TODO|FIXME|HACK|XXX|BUG|REFACTOR|OPTIMIZE|REVIEW)[:\s]"
        + text_tail,
        file_type,
    )
    for opener, text_tail, file_type in (
        ("#", r"(.*)$", "python"),  # Python line comment
        ("//", r"(.*)$", "typescript"),  # JS/TS line comment
        (r"/\*", r"(.*?)\*/", "typescript"),  # JS/TS block comment on one line
    )
]

# Thresholds (configurable)
LARGE_FILE_THRESHOLD = 500  # code lines above which a file is reported
LONG_FUNCTION_THRESHOLD = 50  # lines above which a function is reported
def find_debt_markers(
    filepath: pathlib.Path, repo_root: pathlib.Path
) -> List[DebtMarker]:
    """Find TODO/FIXME/HACK-style comment markers in a file.

    Only Python (``.py``) and JavaScript/TypeScript (``.ts``, ``.js``,
    ``.tsx``, ``.jsx``) files are scanned. Any other suffix yields an empty
    list without the file being opened.

    Args:
        filepath: File to scan.
        repo_root: Directory that the reported path is made relative to.

    Returns:
        One ``DebtMarker`` per line matched by each applicable pattern in
        ``DEBT_PATTERNS``. Results are grouped by pattern (in list order) and
        ordered by ascending line number within a pattern. Empty when the
        file cannot be read or is not valid UTF-8.

    Raises:
        ValueError: If ``filepath`` is not located under ``repo_root``.
    """
    # Classify by suffix first so unsupported files are never read or decoded.
    suffix = filepath.suffix.lower()
    if suffix == ".py":
        file_type = "python"
    elif suffix in {".ts", ".js", ".tsx", ".jsx"}:
        file_type = "typescript"
    else:
        return []

    try:
        content = filepath.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return []

    rel_path = filepath.relative_to(repo_root).as_posix()
    lines = content.splitlines()  # split once, reused for every pattern
    markers: List[DebtMarker] = []

    for pattern, pattern_type in DEBT_PATTERNS:
        if pattern_type != file_type:
            continue
        regex = re.compile(pattern, re.IGNORECASE)
        for line_num, line in enumerate(lines, start=1):
            match = regex.search(line)
            if not match:
                continue
            # Keywords are matched case-insensitively; normalize for grouping.
            marker_type = match.group(1).upper()
            text = match.group(2).strip() if match.lastindex >= 2 else ""
            markers.append(
                DebtMarker(
                    file=rel_path,
                    line=line_num,
                    marker_type=marker_type,
                    text=text,
                )
            )
    return markers
def analyze_file_size(
    filepath: pathlib.Path, repo_root: pathlib.Path
) -> Optional[LargeFile]:
    """Report a file whose code-line count is above ``LARGE_FILE_THRESHOLD``.

    A code line is any non-blank line that does not begin (after stripping
    whitespace) with the single-line comment prefix for the file type: ``#``
    for ``.py`` and ``//`` for ``.ts``/``.js``/``.tsx``/``.jsx``. No other
    comment or docstring syntax is recognized. For any other suffix every
    line, blank or not, is counted.

    Returns:
        A ``LargeFile`` with the path relative to ``repo_root`` when the count
        exceeds the threshold, otherwise ``None``. ``None`` is also returned
        when the file cannot be read or is not valid UTF-8.
    """
    try:
        text = filepath.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return None

    all_lines = text.splitlines()
    line_total = len(all_lines)

    # Pick the line-comment prefix for this file type, if it has one.
    ext = filepath.suffix.lower()
    if ext == ".py":
        comment_prefix = "#"
    elif ext in {".ts", ".js", ".tsx", ".jsx"}:
        comment_prefix = "//"
    else:
        comment_prefix = None

    if comment_prefix is None:
        code_total = line_total
    else:
        code_total = 0
        for raw_line in all_lines:
            stripped = raw_line.strip()
            if stripped and not stripped.startswith(comment_prefix):
                code_total += 1

    if code_total <= LARGE_FILE_THRESHOLD:
        return None
    return LargeFile(
        path=filepath.relative_to(repo_root).as_posix(),
        total_lines=line_total,
        code_lines=code_total,
    )
def _bracket_balance(code_line: str) -> int:
    """Return opening minus closing brackets on one line (heuristic).

    Brackets inside a single-line string literal or after a ``#`` comment
    marker are ignored. Strings spanning several lines are not understood.
    """
    balance = 0
    quote = None  # quote character of the literal we are inside, if any
    escaped = False
    for char in code_line:
        if quote is not None:
            if escaped:
                escaped = False
            elif char == "\\":
                escaped = True
            elif char == quote:
                quote = None
        elif char in "\"'":
            quote = char
        elif char == "#":
            break  # the rest of the line is a comment
        elif char in "([{":
            balance += 1
        elif char in ")]}":
            balance -= 1
    return balance


def find_long_functions_python(
    filepath: pathlib.Path, repo_root: pathlib.Path
) -> List[LongFunction]:
    """Find Python functions that exceed ``LONG_FUNCTION_THRESHOLD`` lines.

    The file is not parsed. Functions are located with a ``def`` regex and
    their extent is inferred from indentation: a function ends at the first
    non-blank, non-comment line indented no deeper than its ``def``. Lines of
    a multi-line signature (including a closing ``) -> T:`` at the ``def``'s
    own indentation) are treated as part of the function. Nested functions
    are tracked on a stack so they do not cut the enclosing function short.

    A function's length runs from its ``def`` line through the last line
    attributed to it; blank lines after that are not counted.

    Known limitation: a multi-line string whose content is indented no deeper
    than the ``def`` ends the function early.

    Args:
        filepath: Python file to scan.
        repo_root: Directory that the reported path is made relative to.

    Returns:
        Long functions ordered by starting line. Empty when the file cannot
        be read or is not valid UTF-8.

    Raises:
        ValueError: If ``filepath`` is not located under ``repo_root``.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return []

    rel_path = filepath.relative_to(repo_root).as_posix()
    long_funcs: List[LongFunction] = []

    # Simple pattern to find function definitions
    func_pattern = re.compile(r"^(\s*)(?:async\s+)?def\s+(\w+)\s*\(")

    # Functions currently open, innermost last. Each entry is a mutable
    # [name, indent, start_index, last_index] record.
    open_funcs: List[list] = []
    # Unclosed brackets of the signature being read; > 0 means the following
    # lines still belong to a multi-line ``def`` header.
    signature_depth = 0

    def close_from(indent: int) -> None:
        """Close every open function whose ``def`` is indented >= ``indent``."""
        while open_funcs and open_funcs[-1][1] >= indent:
            name, _, start, last = open_funcs.pop()
            length = last - start + 1
            if length > LONG_FUNCTION_THRESHOLD:
                long_funcs.append(
                    LongFunction(
                        file=rel_path,
                        function_name=name,
                        line=start + 1,
                        length=length,
                    )
                )

    for index, line in enumerate(content.splitlines()):
        stripped = line.strip()
        if not stripped:
            continue  # blank lines neither end nor extend a function

        match = func_pattern.match(line)

        # Continuation of a multi-line signature: never a dedent. A new
        # ``def`` line always wins, so a miscounted bracket cannot swallow
        # the rest of the file.
        if signature_depth > 0 and match is None:
            signature_depth = max(0, signature_depth + _bracket_balance(stripped))
            for entry in open_funcs:
                entry[3] = index
            continue

        if match is not None:
            indent = len(match.group(1))
            # A def at the same or a shallower level ends earlier functions.
            close_from(indent)
            open_funcs.append([match.group(2), indent, index, index])
            signature_depth = max(0, _bracket_balance(stripped))
            enclosing = open_funcs
        else:
            indent = len(line) - len(line.lstrip())
            if stripped.startswith("#"):
                # Comments never end a function; they only count toward the
                # functions they are indented under.
                enclosing = [entry for entry in open_funcs if entry[1] < indent]
            else:
                close_from(indent)
                enclosing = open_funcs

        for entry in enclosing:
            entry[3] = index

    # Whatever is still open runs to the end of the file.
    close_from(0)

    # Inner functions close before outer ones; restore source order.
    long_funcs.sort(key=lambda func: func.line)
    return long_funcs
def find_long_functions_typescript(
    filepath: pathlib.Path, repo_root: pathlib.Path
) -> List[LongFunction]:
    """Find TypeScript/JavaScript functions over ``LONG_FUNCTION_THRESHOLD``.

    The file is not parsed. A line-anchored regex recognizes
    ``function name`` declarations and ``name(...) {`` method-style headers
    (optionally prefixed by ``export`` / ``async``). The end of each function
    is found by counting ``{`` and ``}`` characters line by line. Braces
    inside strings and comments are counted too, so the result is a
    heuristic.

    Control statements such as ``if (...) {`` have the same shape as a method
    header and are skipped explicitly. Once a function's end is found,
    scanning resumes after it, so functions nested inside it are not reported
    separately.

    Args:
        filepath: Source file to scan.
        repo_root: Directory that the reported path is made relative to.

    Returns:
        Long functions in source order. Empty when the file cannot be read or
        is not valid UTF-8. Scanning stops at the first function whose braces
        never balance.

    Raises:
        ValueError: If ``filepath`` is not located under ``repo_root``.
    """
    try:
        content = filepath.read_text(encoding="utf-8")
    except (UnicodeDecodeError, OSError):
        return []

    rel_path = filepath.relative_to(repo_root).as_posix()
    lines = content.splitlines()
    long_funcs: List[LongFunction] = []

    # Simplified pattern for function definitions
    func_pattern = re.compile(
        r"^\s*(?:export\s+)?(?:async\s+)?(?:function\s+(\w+)|(\w+)\s*(?:<[^>]*>)?\s*\([^)]*\)\s*(?::\s*[^{]+)?\s*\{)"
    )
    # Statement keywords whose ``keyword (...) {`` form also satisfies the
    # method alternative (group 2) of the pattern but are not functions.
    control_keywords = {"if", "for", "while", "switch", "catch", "with"}

    i = 0
    while i < len(lines):
        match = func_pattern.match(lines[i])
        # group(2) is None for ``function name`` matches, so the membership
        # test only filters method-style matches.
        if match is None or match.group(2) in control_keywords:
            i += 1
            continue

        if match.group(1):
            func_name = match.group(1)
        elif match.group(2) == "function":
            # ``function (...) {`` with no name matches the method alternative
            # with the keyword itself captured as the name.
            func_name = "anonymous"
        else:
            func_name = match.group(2) or "anonymous"

        func_start = i
        func_end = None

        # Count braces to find function end
        brace_count = 0
        found_open = False
        for j in range(func_start, len(lines)):
            opened = lines[j].count("{")
            brace_count += opened - lines[j].count("}")
            found_open = found_open or opened > 0
            # Balance is checked once per line, after the whole line is counted.
            if found_open and brace_count == 0:
                func_end = j
                break

        if func_end is None:
            # Reached end without closing brace: the rest of the file cannot
            # be balanced either, so stop scanning.
            break

        length = func_end - func_start + 1
        if length > LONG_FUNCTION_THRESHOLD:
            long_funcs.append(
                LongFunction(
                    file=rel_path,
                    function_name=func_name,
                    line=func_start + 1,
                    length=length,
                )
            )
        # Resume after the closing line; nested definitions are skipped.
        i = func_end + 1

    return long_funcs
def analyze_debt(repo_root: pathlib.Path) -> dict:
    """Run the complete debt indicator analysis for a repository.

    Every file returned by ``get_tracked_source_files`` for the supported
    extensions is checked for debt markers, for exceeding the file-size
    threshold, and for long functions.

    Returns:
        Dictionary with four keys:

        * ``debt_markers``: markers grouped by type, the total count, and a
          per-type count summary.
        * ``large_files``: large files sorted by code lines (descending),
          their count, and the threshold used.
        * ``long_functions``: the 50 longest functions (descending), the
          total count of long functions, and the threshold used.
        * ``files_analyzed``: number of source files examined.
    """
    # Find all source files using git ls-files (respects .gitignore)
    extensions = [".py", ".ts", ".js", ".tsx", ".jsx"]
    source_files = get_tracked_source_files(repo_root, extensions)

    all_markers: List[DebtMarker] = []
    large_files: List[LargeFile] = []
    long_functions: List[LongFunction] = []
    script_suffixes = {".ts", ".js", ".tsx", ".jsx"}

    for source in source_files:
        all_markers.extend(find_debt_markers(source, repo_root))

        oversized = analyze_file_size(source, repo_root)
        if oversized:
            large_files.append(oversized)

        # Pick the function-length scanner that matches the file type.
        ext = source.suffix.lower()
        if ext == ".py":
            long_functions.extend(find_long_functions_python(source, repo_root))
        elif ext in script_suffixes:
            long_functions.extend(find_long_functions_typescript(source, repo_root))

    # Bucket serialized markers under their type, keeping discovery order.
    markers_by_type: Dict[str, List[dict]] = {}
    for marker in all_markers:
        markers_by_type.setdefault(marker.marker_type, []).append(marker.to_dict())

    # Largest first for both rankings.
    large_files = sorted(large_files, key=lambda entry: entry.code_lines, reverse=True)
    long_functions = sorted(
        long_functions, key=lambda entry: entry.length, reverse=True
    )

    marker_report = {
        "by_type": markers_by_type,
        "total_count": len(all_markers),
        "summary": {kind: len(found) for kind, found in markers_by_type.items()},
    }
    file_report = {
        "files": [entry.to_dict() for entry in large_files],
        "count": len(large_files),
        "threshold": LARGE_FILE_THRESHOLD,
    }
    function_report = {
        # Only the top 50 are listed; "count" still covers all of them.
        "functions": [entry.to_dict() for entry in long_functions[:50]],
        "count": len(long_functions),
        "threshold": LONG_FUNCTION_THRESHOLD,
    }
    return {
        "debt_markers": marker_report,
        "large_files": file_report,
        "long_functions": function_report,
        "files_analyzed": len(source_files),
    }
if __name__ == "__main__":
    # Script entry point: analyze the parent of this file's directory and
    # print the report as indented JSON.
    import json

    report = analyze_debt(pathlib.Path(__file__).parent.parent)
    print(json.dumps(report, indent=2))

Xet Storage Details

Size:
11.1 kB
·
Xet hash:
4d232cb4d5d68ae871e9ace48327f878aa8efa35c5bcf8e6c3217cce48c38488

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.