mr4's picture
Upload 136 files
fd8cdf5 verified
import * as fs from 'node:fs';
import * as path from 'node:path';
import { execSync } from 'node:child_process';
/**
 * Directory names that the scanner never descends into.
 * Matching is done on the bare directory name, at any depth.
 */
export const DEFAULT_IGNORE_DIRS = [
    // dependency and VCS folders
    'node_modules', '.git',
    // build output
    'dist', 'build',
    // third-party / interpreter caches
    'vendor', '__pycache__',
    // framework and tooling caches
    '.next', '.cache', '.turbo',
    // compiled artifacts
    'target', 'obj',
    // this tool's own working directory
    '.understand-anything',
];
/**
 * Filename globs that are always skipped while scanning
 * (lock files, generated bundles, and binary assets).
 */
export const DEFAULT_IGNORE_FILE_PATTERNS = [
    // lock files and generated bundles
    '*.lock', '*.min.js', '*.min.css', '*.map',
    // images
    '*.png', '*.jpg', '*.jpeg', '*.gif', '*.svg', '*.ico',
    // fonts
    '*.woff', '*.woff2', '*.ttf', '*.eot',
    // audio / video
    '*.mp3', '*.mp4', '*.webm',
    // archives and documents
    '*.zip', '*.tar', '*.gz', '*.pdf',
    // native binaries
    '*.exe', '*.dll', '*.so', '*.dylib',
];
/**
 * Lookup table from a lowercase file extension (including the leading dot)
 * to the language label reported for that file.
 */
const EXTENSION_LANGUAGE_MAP = {
    // JavaScript / TypeScript family
    '.ts': 'typescript', '.tsx': 'typescript',
    '.js': 'javascript', '.jsx': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',
    // general-purpose languages
    '.py': 'python',
    '.go': 'go',
    '.rs': 'rust',
    '.java': 'java',
    '.rb': 'ruby',
    '.php': 'php',
    '.cs': 'csharp',
    // C / C++ (plain ".h" headers are reported as C)
    '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp',
    '.c': 'c', '.h': 'c',
    '.hpp': 'cpp',
    '.swift': 'swift',
    '.kt': 'kotlin',
    // shell and queries
    '.sh': 'shell', '.bash': 'shell',
    '.sql': 'sql',
    // web markup and styles
    '.html': 'html', '.htm': 'html',
    '.css': 'css', '.scss': 'css', '.less': 'css',
    // structured data and documents
    '.json': 'json',
    '.yaml': 'yaml', '.yml': 'yaml',
    '.md': 'markdown',
    '.xml': 'xml',
    '.toml': 'toml',
    // schemas and infrastructure
    '.graphql': 'graphql', '.gql': 'graphql',
    '.proto': 'protobuf',
    '.tf': 'terraform',
    // Windows scripting
    '.ps1': 'powershell',
    '.bat': 'batch', '.cmd': 'batch',
    // plain text and configuration
    '.txt': 'text',
    '.rst': 'restructuredtext',
    '.adoc': 'asciidoc',
    '.env': 'env',
    '.ini': 'ini', '.cfg': 'ini',
};
/**
 * Lookup table from a lowercase file extension (including the leading dot)
 * to the coarse category reported for that file. It is built from a
 * category -> extensions table so each group is declared exactly once.
 */
const EXTENSION_CATEGORY_MAP = Object.fromEntries([
    ['code', [
        '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs',
        '.py', '.go', '.rs', '.java', '.rb', '.php', '.cs',
        '.cpp', '.cc', '.cxx', '.c', '.h', '.hpp',
        '.swift', '.kt',
    ]],
    ['config', ['.json', '.yaml', '.yml', '.toml', '.xml', '.env', '.ini', '.cfg']],
    ['docs', ['.md', '.txt', '.rst', '.adoc']],
    ['infra', ['.tf']],
    ['data', ['.sql', '.graphql', '.gql', '.proto']],
    ['script', ['.sh', '.bash', '.ps1', '.bat', '.cmd']],
    ['markup', ['.html', '.htm', '.css', '.scss', '.less', '.svg']],
].flatMap(([category, extensions]) => extensions.map((extension) => [extension, category])));
/**
 * Lowercase basenames that always classify a file as infrastructure,
 * whatever its extension says.
 */
const INFRA_FILENAMES = ['dockerfile', 'docker-compose.yml', 'docker-compose.yaml'];
/**
 * Loads user-defined ignore patterns from
 * `<projectRoot>/.understand-anything/.understandignore`.
 *
 * Each line is trimmed; blank lines and lines starting with `#` are dropped.
 *
 * @param projectRoot - Directory that contains the `.understand-anything` folder
 * @returns The remaining patterns in file order, or `[]` when the file cannot be read
 */
function readUnderstandIgnore(projectRoot) {
    const ignoreFile = path.join(projectRoot, '.understand-anything', '.understandignore');
    let raw;
    try {
        raw = fs.readFileSync(ignoreFile, 'utf-8');
    }
    catch {
        // A missing or unreadable ignore file simply means "no extra patterns".
        return [];
    }
    const patterns = [];
    for (const rawLine of raw.split('\n')) {
        const entry = rawLine.trim();
        if (entry.length === 0 || entry.startsWith('#')) {
            continue;
        }
        patterns.push(entry);
    }
    return patterns;
}
/**
 * Tests a bare filename against a simple glob.
 *
 * `*` matches any run of characters, `?` matches exactly one character, and
 * every other regex metacharacter is taken literally. The comparison is
 * case-insensitive and must cover the whole filename.
 *
 * @param filename - Name to test (no directory part expected)
 * @param pattern - Glob such as `*.lock`
 * @returns true when the filename matches the glob
 */
function matchesGlobPattern(filename, pattern) {
    // Translate the glob in one pass: wildcards become regex wildcards,
    // everything else that is special to a regex is backslash-escaped.
    const regexSource = pattern.replace(/[.+^${}()|[\]\\*?]/g, (symbol) => {
        if (symbol === '*') {
            return '.*';
        }
        if (symbol === '?') {
            return '.';
        }
        return `\\${symbol}`;
    });
    return new RegExp(`^${regexSource}$`, 'i').test(filename);
}
/**
 * Tests a project-relative path against an ignore pattern.
 *
 * Both arguments have backslashes normalized to `/` first.
 *
 * - A pattern without `/` is matched against the basename only
 *   (see matchesGlobPattern).
 * - A pattern with `/` is matched against the whole path, case-insensitively:
 *   - `*` matches within one path segment;
 *   - `**` matches across segments;
 *   - a leading or mid-pattern `**` segment followed by `/` also matches zero
 *     directories, so `src/**` + `/foo.js` style patterns cover `src/foo.js`;
 *   - a trailing `/**` matches the directory itself as well as everything
 *     below it;
 *   - `?` matches a single character.
 *
 * @param relativePath - Path relative to the project root
 * @param pattern - Ignore pattern (glob)
 * @returns true when the path is covered by the pattern
 */
function matchesPathPattern(relativePath, pattern) {
    const normalizedPath = relativePath.replace(/\\/g, '/');
    const normalizedPattern = pattern.replace(/\\/g, '/');
    if (!normalizedPattern.includes('/')) {
        // Take the basename from the normalized path so that backslash-separated
        // input is split correctly on every platform.
        return matchesGlobPattern(path.posix.basename(normalizedPath), normalizedPattern);
    }
    let source = '';
    let i = 0;
    while (i < normalizedPattern.length) {
        const ch = normalizedPattern[i];
        if (ch === '/' && normalizedPattern.slice(i) === '/**') {
            // Trailing "/**": the directory itself or anything underneath it.
            source += '(?:/.*)?';
            break;
        }
        if (ch === '*') {
            // Consume the whole run of consecutive stars.
            let end = i;
            while (normalizedPattern[end + 1] === '*') {
                end += 1;
            }
            const isGlobstar = end > i;
            const atSegmentStart = i === 0 || normalizedPattern[i - 1] === '/';
            if (isGlobstar && atSegmentStart && normalizedPattern[end + 1] === '/') {
                // A whole "**/" segment: zero or more complete directories.
                source += '(?:.*/)?';
                i = end + 2;
            }
            else {
                source += isGlobstar ? '.*' : '[^/]*';
                i = end + 1;
            }
            continue;
        }
        if (ch === '?') {
            source += '.';
        }
        else if (/[.+^${}()|[\]\\]/.test(ch)) {
            source += `\\${ch}`;
        }
        else {
            source += ch;
        }
        i += 1;
    }
    return new RegExp(`^${source}$`, 'i').test(normalizedPath);
}
/**
 * Decides whether a file is filtered out of the scan.
 *
 * A file is excluded when its name matches one of the built-in
 * DEFAULT_IGNORE_FILE_PATTERNS, or when its relative path matches any of the
 * caller-supplied ignore patterns.
 *
 * @param relativePath - Path relative to the project root
 * @param filename - Basename of the file
 * @param ignorePatterns - Additional ignore patterns
 * @returns true when the file must be skipped
 */
function shouldExcludeFile(relativePath, filename, ignorePatterns) {
    const hitsBuiltIn = DEFAULT_IGNORE_FILE_PATTERNS.some((glob) => matchesGlobPattern(filename, glob));
    if (hitsBuiltIn) {
        return true;
    }
    return ignorePatterns.some((glob) => matchesPathPattern(relativePath, glob));
}
/**
 * Determines the language label for a file from its name.
 *
 * Resolution order:
 *  1. `Dockerfile` / `Dockerfile.*` (case-insensitive) -> `'dockerfile'`
 *  2. a file named exactly `.env` -> `'env'`
 *  3. the extension looked up in EXTENSION_LANGUAGE_MAP
 *  4. `.env.<suffix>` files whose suffix is not a known extension -> `'env'`
 *  5. otherwise `'unknown'`
 *
 * @param filePath - Absolute or relative path of the file
 * @returns The detected language label
 */
function detectLanguage(filePath) {
    const basename = path.basename(filePath).toLowerCase();
    if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) {
        return 'dockerfile';
    }
    // path.extname() returns '' for dotfiles such as ".env", so the '.env'
    // entry of the extension map can never match them; handle them by name.
    if (basename === '.env') {
        return 'env';
    }
    const ext = path.extname(filePath).toLowerCase();
    const mapped = EXTENSION_LANGUAGE_MAP[ext];
    if (mapped) {
        return mapped;
    }
    // ".env.local", ".env.production", ...: only when the suffix itself is not
    // a recognized extension, so existing results are never overridden.
    return basename.startsWith('.env.') ? 'env' : 'unknown';
}
/**
 * Determines the coarse category of a file from its name and location.
 *
 * Resolution order (the first three all yield `'infra'`, so their relative
 * order does not affect the result):
 *  1. `Dockerfile` / `Dockerfile.*` (case-insensitive) -> `'infra'`
 *  2. any parent directory named `k8s` or `helm`, optionally prefixed
 *     (`.k8s`, `my-helm`, `app_k8s`, ...) -> `'infra'`
 *  3. a file named exactly `.env` -> `'config'`
 *  4. a basename listed in INFRA_FILENAMES -> `'infra'`
 *  5. the extension looked up in EXTENSION_CATEGORY_MAP
 *  6. `.env.<suffix>` files with an unmapped suffix -> `'config'`
 *  7. otherwise `'code'`
 *
 * @param filePath - Path of the file (either separator style is accepted)
 * @returns The detected category
 */
function detectCategory(filePath) {
    const normalizedPath = filePath.replace(/\\/g, '/');
    const basename = path.posix.basename(normalizedPath).toLowerCase();
    if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) {
        return 'infra';
    }
    // Match whole directory names rather than substrings of the path, so that
    // folders such as "overwhelm/" are not mistaken for Helm charts.
    const parentDirs = normalizedPath.split('/').slice(0, -1);
    if (parentDirs.some((dir) => /(?:^|[._-])(?:k8s|helm)$/.test(dir))) {
        return 'infra';
    }
    // path.extname() returns '' for dotfiles such as ".env", so the '.env'
    // entry of the extension map can never match them; handle them by name.
    if (basename === '.env') {
        return 'config';
    }
    if (INFRA_FILENAMES.includes(basename)) {
        return 'infra';
    }
    const ext = path.posix.extname(normalizedPath).toLowerCase();
    const mapped = EXTENSION_CATEGORY_MAP[ext];
    if (mapped) {
        return mapped;
    }
    // ".env.local", ".env.production", ...: only when the suffix itself is not
    // a recognized extension, so existing results are never overridden.
    return basename.startsWith('.env.') ? 'config' : 'code';
}
/**
 * Returns the number of lines in a file, defined as the number of `\n`
 * characters plus one (so a trailing newline adds a final empty line).
 *
 * @param filePath - File to read as UTF-8
 * @returns 0 for an empty or unreadable file, otherwise the line count
 */
function countLines(filePath) {
    let text;
    try {
        text = fs.readFileSync(filePath, 'utf-8');
    }
    catch {
        // Unreadable files are reported as having no lines.
        return 0;
    }
    if (text === '') {
        return 0;
    }
    let lines = 1;
    for (let at = text.indexOf('\n'); at !== -1; at = text.indexOf('\n', at + 1)) {
        lines += 1;
    }
    return lines;
}
/**
 * Walks a directory tree depth-first and appends one record per accepted file
 * to `files`.
 *
 * Directories are skipped when their name is in DEFAULT_IGNORE_DIRS or when
 * their relative path (with or without a trailing `/`) matches an ignore
 * pattern. Files are skipped when shouldExcludeFile says so. Entries that are
 * neither regular files nor directories are ignored. Unreadable directories
 * are silently skipped.
 *
 * @param dirPath - Directory to list
 * @param projectRoot - Root used to compute relative paths
 * @param ignorePatterns - Ignore patterns applied to directories and files
 * @param files - Output array, mutated in place
 */
function scanDirectory(dirPath, projectRoot, ignorePatterns, files) {
    let dirents;
    try {
        dirents = fs.readdirSync(dirPath, { withFileTypes: true });
    }
    catch {
        return;
    }
    for (const dirent of dirents) {
        const absolutePath = path.join(dirPath, dirent.name);
        // Relative paths are always reported with forward slashes.
        const posixRelative = path.relative(projectRoot, absolutePath).replace(/\\/g, '/');
        if (dirent.isDirectory()) {
            const isIgnoredDir = DEFAULT_IGNORE_DIRS.includes(dirent.name) ||
                ignorePatterns.some((pattern) => matchesPathPattern(posixRelative, pattern) ||
                    matchesPathPattern(`${posixRelative}/`, pattern));
            if (!isIgnoredDir) {
                scanDirectory(absolutePath, projectRoot, ignorePatterns, files);
            }
            continue;
        }
        if (!dirent.isFile()) {
            continue;
        }
        if (shouldExcludeFile(posixRelative, dirent.name, ignorePatterns)) {
            continue;
        }
        const language = detectLanguage(absolutePath);
        const category = detectCategory(posixRelative);
        const lineCount = countLines(absolutePath);
        files.push({ path: absolutePath, relativePath: posixRelative, category, language, lineCount });
    }
}
/**
 * Config-file name prefixes that reveal a framework, as
 * `{ pattern, framework }` records in detection order.
 */
const FRAMEWORK_FILE_PATTERNS = [
    ['next.config', 'Next.js'],
    ['nuxt.config', 'Nuxt'],
    ['angular.json', 'Angular'],
    ['svelte.config', 'Svelte'],
    ['astro.config', 'Astro'],
    ['vite.config', 'Vite'],
    ['webpack.config', 'Webpack'],
    ['tailwind.config', 'TailwindCSS'],
    ['manage.py', 'Django'],
].map(([pattern, framework]) => ({ pattern, framework }));
/**
 * npm package name -> framework label, used against the dependencies listed
 * in package.json. Entry order is the order in which frameworks are reported.
 */
const DEPENDENCY_FRAMEWORK_MAP = Object.fromEntries([
    ['react', 'React'],
    ['vue', 'Vue'],
    ['@angular/core', 'Angular'],
    ['next', 'Next.js'],
    ['express', 'Express'],
    ['fastify', 'Fastify'],
    ['@nestjs/core', 'NestJS'],
    ['astro', 'Astro'],
    ['svelte', 'Svelte'],
    ['nuxt', 'Nuxt'],
    ['vite', 'Vite'],
]);
/**
 * Lowercase Python package name -> framework label, used against the
 * dependencies listed in pyproject.toml.
 */
const PYTHON_FRAMEWORK_MAP = {
    django: 'Django',
    flask: 'Flask',
    fastapi: 'FastAPI',
};
/**
 * Best-effort UTF-8 file read.
 *
 * @param filePath - File to read
 * @returns The file contents, or `null` when the read fails for any reason
 */
function readFileSafe(filePath) {
    let text = null;
    try {
        text = fs.readFileSync(filePath, { encoding: 'utf-8' });
    }
    catch {
        // Leave `text` as null: callers treat that as "file not available".
    }
    return text;
}
/**
 * Extracts a single-line string value from a TOML document using regular
 * expressions (not a full TOML parser).
 *
 * The section header must start its own line (leading spaces/tabs allowed),
 * and the search stops at the next line that begins with `[`. Both basic
 * (`"..."`) and literal (`'...'`) strings are recognized; escape sequences,
 * multi-line strings and unquoted values are not interpreted.
 *
 * @param content - Full TOML text
 * @param section - Table name without brackets, e.g. `project` or `tool.poetry`
 * @param key - Key to look up inside that table
 * @returns The string value, or `''` when the section or key is not found
 */
function extractTomlValue(content, section, key) {
    // Section and key are matched literally: escape every regex metacharacter.
    const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    // Anchor the header to the start of a line so that "[project]" appearing
    // inside a comment, a value, or an "[[array-of-tables]]" is not picked up.
    const headerRegex = new RegExp(`^[ \\t]*\\[${escapeRegExp(section)}\\]`, 'm');
    const headerMatch = headerRegex.exec(content);
    if (!headerMatch) {
        return '';
    }
    // Restrict the key search to this table: everything up to the next header.
    const afterHeader = content.slice(headerMatch.index + headerMatch[0].length);
    const nextHeaderIdx = afterHeader.search(/^\[/m);
    const sectionBody = nextHeaderIdx >= 0 ? afterHeader.slice(0, nextHeaderIdx) : afterHeader;
    const keyRegex = new RegExp(`^[ \\t]*${escapeRegExp(key)}\\s*=\\s*(?:"([^"]*)"|'([^']*)')`, 'm');
    const keyMatch = keyRegex.exec(sectionBody);
    if (!keyMatch) {
        return '';
    }
    // Group 1 is the double-quoted form, group 2 the single-quoted form.
    return keyMatch[1] !== undefined ? keyMatch[1] : keyMatch[2];
}
/**
 * Extracts the package names listed in `dependencies = [...]` of the
 * `[project]` table of a pyproject.toml.
 *
 * The array is scanned character by character so that `]` inside a quoted
 * requirement (extras such as `"uvicorn[standard]"`) does not end the array,
 * and so that single-line and multi-line arrays behave the same. `#` comments
 * outside of strings are skipped.
 *
 * @param content - Full pyproject.toml text
 * @returns Lowercase distribution names, without extras, version specifiers
 *          or environment markers; `[]` when no dependency array is found
 */
function extractTomlDependencies(content) {
    const header = /^[ \t]*\[project\]/m.exec(content);
    if (!header) {
        return [];
    }
    // Only look inside the [project] table (up to the next table header).
    const afterHeader = content.slice(header.index + header[0].length);
    const nextHeaderIdx = afterHeader.search(/^\[/m);
    const projectBody = nextHeaderIdx >= 0 ? afterHeader.slice(0, nextHeaderIdx) : afterHeader;
    const arrayStart = /^[ \t]*dependencies\s*=\s*\[/m.exec(projectBody);
    if (!arrayStart) {
        return [];
    }
    const names = [];
    let quote = ''; // the quote character of the string being read, or ''
    let requirement = '';
    for (let i = arrayStart.index + arrayStart[0].length; i < projectBody.length; i += 1) {
        const ch = projectBody[i];
        if (quote) {
            if (ch === '\\' && quote === '"' && i + 1 < projectBody.length) {
                // Backslash escapes exist only in double-quoted TOML strings.
                requirement += projectBody[i + 1];
                i += 1;
            }
            else if (ch === quote) {
                // End of one requirement string: keep just the leading name.
                const nameMatch = /^[A-Za-z0-9][A-Za-z0-9._-]*/.exec(requirement.trim());
                if (nameMatch) {
                    names.push(nameMatch[0].toLowerCase());
                }
                quote = '';
                requirement = '';
            }
            else {
                requirement += ch;
            }
        }
        else if (ch === '"' || ch === "'") {
            quote = ch;
        }
        else if (ch === '#') {
            // Comment: skip to the end of the line.
            const eol = projectBody.indexOf('\n', i);
            if (eol === -1) {
                break;
            }
            i = eol;
        }
        else if (ch === ']') {
            break; // end of the dependencies array
        }
    }
    return names;
}
/**
 * Detects project metadata from manifest files and the scanned file list.
 *
 * Name and description come from the first manifest that yields a name, tried
 * in this order: package.json, pyproject.toml, Cargo.toml, go.mod, pom.xml.
 * Frameworks are collected from package.json dependencies, pyproject.toml
 * dependencies (only consulted when package.json gave no name), well-known
 * config file names, Flask/FastAPI imports in any `app.py` / `wsgi.py`, and a
 * `rails` entry in the Gemfile. Languages are ranked by number of files.
 *
 * @param projectRoot - Absolute path to the project root directory
 * @param files - File records produced by the scan; `path`, `relativePath`
 *                and `language` are read
 * @returns `{ name, description, languages, frameworks, analyzedAt, gitCommitHash }`
 *          where `gitCommitHash` is `''` when git is unavailable
 */
export function detectMetadata(projectRoot, files) {
    let name = '';
    let description = '';
    const detectedFrameworks = new Set();
    // 1. Try package.json first (highest priority)
    const packageJsonContent = readFileSafe(path.join(projectRoot, 'package.json'));
    if (packageJsonContent) {
        try {
            const pkg = JSON.parse(packageJsonContent);
            name = pkg.name || '';
            description = pkg.description || '';
            // Detect frameworks from runtime and dev dependencies alike.
            const allDeps = {
                ...(pkg.dependencies || {}),
                ...(pkg.devDependencies || {}),
            };
            for (const [depName, framework] of Object.entries(DEPENDENCY_FRAMEWORK_MAP)) {
                if (depName in allDeps) {
                    detectedFrameworks.add(framework);
                }
            }
            // TypeScript is reported as a language (from file extensions), not
            // as a framework, so nothing is derived from it here.
        }
        catch {
            // Invalid JSON (or a non-object manifest): fall through to the
            // other manifests.
        }
    }
    // 2. Try pyproject.toml
    if (!name) {
        const pyprojectContent = readFileSafe(path.join(projectRoot, 'pyproject.toml'));
        if (pyprojectContent) {
            name = extractTomlValue(pyprojectContent, 'project', 'name');
            description = extractTomlValue(pyprojectContent, 'project', 'description');
            // Detect Python frameworks from dependencies
            const deps = extractTomlDependencies(pyprojectContent);
            for (const [depName, framework] of Object.entries(PYTHON_FRAMEWORK_MAP)) {
                if (deps.some(d => d === depName || d.startsWith(depName + '['))) {
                    detectedFrameworks.add(framework);
                }
            }
        }
    }
    // 3. Try Cargo.toml
    if (!name) {
        const cargoContent = readFileSafe(path.join(projectRoot, 'Cargo.toml'));
        if (cargoContent) {
            name = extractTomlValue(cargoContent, 'package', 'name');
            description = extractTomlValue(cargoContent, 'package', 'description');
        }
    }
    // 4. Try go.mod
    if (!name) {
        const goModContent = readFileSafe(path.join(projectRoot, 'go.mod'));
        if (goModContent) {
            const moduleMatch = goModContent.match(/^module\s+(.+)$/m);
            if (moduleMatch) {
                name = moduleMatch[1].trim();
            }
        }
    }
    // 5. Try pom.xml
    if (!name) {
        const pomContent = readFileSafe(path.join(projectRoot, 'pom.xml'));
        if (pomContent) {
            const artifactIdMatch = pomContent.match(/<artifactId>([^<]+)<\/artifactId>/);
            const nameMatch = pomContent.match(/<name>([^<]+)<\/name>/);
            // Prefer the human-readable <name>, falling back to <artifactId>.
            name = nameMatch ? nameMatch[1] : (artifactIdMatch ? artifactIdMatch[1] : '');
            const descMatch = pomContent.match(/<description>([^<]+)<\/description>/);
            description = descMatch ? descMatch[1] : '';
        }
    }
    // 6. Detect languages from scanned files
    const languageCounts = new Map();
    for (const file of files) {
        if (file.language && file.language !== 'unknown') {
            languageCounts.set(file.language, (languageCounts.get(file.language) || 0) + 1);
        }
    }
    // Sort by frequency (most common first)
    const languages = [...languageCounts.entries()]
        .sort((a, b) => b[1] - a[1])
        .map(([lang]) => lang);
    // 7. Detect frameworks from file patterns
    for (const file of files) {
        const basename = path.basename(file.relativePath);
        for (const { pattern, framework } of FRAMEWORK_FILE_PATTERNS) {
            // Patterns containing a dot are prefixes (e.g. "next.config.mjs"
            // starts with "next.config"); dot-less patterns must match exactly.
            const isMatch = pattern.includes('.')
                ? basename.startsWith(pattern)
                : basename === pattern;
            if (isMatch) {
                detectedFrameworks.add(framework);
            }
        }
    }
    // Check for Flask/FastAPI imports in every app.py / wsgi.py, not just the
    // first one found: the first match may be an unrelated script.
    for (const file of files) {
        const basename = path.basename(file.relativePath);
        if (basename !== 'app.py' && basename !== 'wsgi.py') {
            continue;
        }
        if (detectedFrameworks.has('Flask') && detectedFrameworks.has('FastAPI')) {
            break; // nothing left to learn; avoid further file reads
        }
        const content = readFileSafe(file.path);
        if (!content) {
            continue;
        }
        if (content.includes('from flask') || content.includes('import flask')) {
            detectedFrameworks.add('Flask');
        }
        if (content.includes('from fastapi') || content.includes('import fastapi')) {
            detectedFrameworks.add('FastAPI');
        }
    }
    // Check for Rails from Gemfile
    const gemfileContent = readFileSafe(path.join(projectRoot, 'Gemfile'));
    if (gemfileContent) {
        if (gemfileContent.includes("'rails'") || gemfileContent.includes('"rails"')) {
            detectedFrameworks.add('Ruby on Rails');
        }
    }
    // 8. Get git commit hash
    let gitCommitHash = '';
    try {
        gitCommitHash = execSync('git rev-parse HEAD', {
            cwd: projectRoot,
            encoding: 'utf-8',
            // Pipe all streams so git's error output does not reach the console.
            stdio: ['pipe', 'pipe', 'pipe'],
        }).trim();
    }
    catch {
        // Not a git repo or git not available
    }
    return {
        name,
        description,
        languages,
        frameworks: [...detectedFrameworks],
        analyzedAt: new Date().toISOString(),
        gitCommitHash,
    };
}
/**
 * Entry point of the scanner: collects the source files of a project and
 * derives its metadata.
 *
 * Patterns passed by the caller are applied first, followed by those read
 * from `.understand-anything/.understandignore`; the built-in directory and
 * file exclusions always apply.
 *
 * @param projectRoot - Absolute path to the project root directory
 * @param ignorePatterns - Additional glob patterns to exclude
 * @returns ScanResult with discovered files and detected project metadata
 */
export function scanProject(projectRoot, ignorePatterns = []) {
    const files = [];
    const combinedPatterns = [...ignorePatterns, ...readUnderstandIgnore(projectRoot)];
    scanDirectory(projectRoot, projectRoot, combinedPatterns, files);
    return { files, metadata: detectMetadata(projectRoot, files) };
}