import * as fs from 'node:fs'; import * as path from 'node:path'; import { execSync } from 'node:child_process'; /** * Default directories to exclude from scanning. */ export const DEFAULT_IGNORE_DIRS = [ 'node_modules', '.git', 'dist', 'build', 'vendor', '__pycache__', '.next', '.cache', '.turbo', 'target', 'obj', '.understand-anything', ]; /** * Default file patterns to exclude from scanning (binary, generated, lock files). */ export const DEFAULT_IGNORE_FILE_PATTERNS = [ '*.lock', '*.min.js', '*.min.css', '*.map', '*.png', '*.jpg', '*.jpeg', '*.gif', '*.svg', '*.ico', '*.woff', '*.woff2', '*.ttf', '*.eot', '*.mp3', '*.mp4', '*.webm', '*.zip', '*.tar', '*.gz', '*.pdf', '*.exe', '*.dll', '*.so', '*.dylib', ]; /** * Extension-to-language mapping. */ const EXTENSION_LANGUAGE_MAP = { '.ts': 'typescript', '.tsx': 'typescript', '.js': 'javascript', '.jsx': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript', '.py': 'python', '.go': 'go', '.rs': 'rust', '.java': 'java', '.rb': 'ruby', '.php': 'php', '.cs': 'csharp', '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.c': 'c', '.h': 'c', '.hpp': 'cpp', '.swift': 'swift', '.kt': 'kotlin', '.sh': 'shell', '.bash': 'shell', '.sql': 'sql', '.html': 'html', '.htm': 'html', '.css': 'css', '.scss': 'css', '.less': 'css', '.json': 'json', '.yaml': 'yaml', '.yml': 'yaml', '.md': 'markdown', '.xml': 'xml', '.toml': 'toml', '.graphql': 'graphql', '.gql': 'graphql', '.proto': 'protobuf', '.tf': 'terraform', '.ps1': 'powershell', '.bat': 'batch', '.cmd': 'batch', '.txt': 'text', '.rst': 'restructuredtext', '.adoc': 'asciidoc', '.env': 'env', '.ini': 'ini', '.cfg': 'ini', }; /** * Extension-to-category mapping. 
*/ const EXTENSION_CATEGORY_MAP = { // code '.ts': 'code', '.tsx': 'code', '.js': 'code', '.jsx': 'code', '.mjs': 'code', '.cjs': 'code', '.py': 'code', '.go': 'code', '.rs': 'code', '.java': 'code', '.rb': 'code', '.php': 'code', '.cs': 'code', '.cpp': 'code', '.cc': 'code', '.cxx': 'code', '.c': 'code', '.h': 'code', '.hpp': 'code', '.swift': 'code', '.kt': 'code', // config '.json': 'config', '.yaml': 'config', '.yml': 'config', '.toml': 'config', '.xml': 'config', '.env': 'config', '.ini': 'config', '.cfg': 'config', // docs '.md': 'docs', '.txt': 'docs', '.rst': 'docs', '.adoc': 'docs', // infra '.tf': 'infra', // data '.sql': 'data', '.graphql': 'data', '.gql': 'data', '.proto': 'data', // script '.sh': 'script', '.bash': 'script', '.ps1': 'script', '.bat': 'script', '.cmd': 'script', // markup '.html': 'markup', '.htm': 'markup', '.css': 'markup', '.scss': 'markup', '.less': 'markup', '.svg': 'markup', }; /** * Filenames that indicate infra category regardless of extension. */ const INFRA_FILENAMES = [ 'dockerfile', 'docker-compose.yml', 'docker-compose.yaml', ]; /** * Reads the .understand-anything/.understandignore file and returns patterns. * Returns an empty array if the file doesn't exist. */ function readUnderstandIgnore(projectRoot) { const ignorePath = path.join(projectRoot, '.understand-anything', '.understandignore'); try { const content = fs.readFileSync(ignorePath, 'utf-8'); return content .split('\n') .map(line => line.trim()) .filter(line => line.length > 0 && !line.startsWith('#')); } catch { return []; } } /** * Checks if a filename matches a glob pattern (simple glob: supports * wildcard). 
*/ function matchesGlobPattern(filename, pattern) { // Convert glob pattern to regex const escaped = pattern .replace(/[.+^${}()|[\]\\]/g, '\\$&') .replace(/\*/g, '.*') .replace(/\?/g, '.'); const regex = new RegExp(`^${escaped}$`, 'i'); return regex.test(filename); } /** * Checks if a relative path matches a glob pattern that may include directory separators. */ function matchesPathPattern(relativePath, pattern) { // Normalize separators const normalizedPath = relativePath.replace(/\\/g, '/'); const normalizedPattern = pattern.replace(/\\/g, '/'); // If pattern contains a slash, match against the full relative path if (normalizedPattern.includes('/')) { const escaped = normalizedPattern .replace(/[.+^${}()|[\]\\]/g, '\\$&') .replace(/\*\*/g, '{{GLOBSTAR}}') .replace(/\*/g, '[^/]*') .replace(/\{\{GLOBSTAR\}\}/g, '.*') .replace(/\?/g, '.'); const regex = new RegExp(`^${escaped}$`, 'i'); // Also try matching as a prefix (for directory patterns like "src/generated/**") if (regex.test(normalizedPath)) { return true; } // If pattern ends with /**, also match the directory itself if (normalizedPattern.endsWith('/**')) { const dirPrefix = normalizedPattern.slice(0, -3); if (normalizedPath.startsWith(dirPrefix + '/') || normalizedPath === dirPrefix) { return true; } } return false; } // Otherwise match against just the filename return matchesGlobPattern(path.basename(relativePath), normalizedPattern); } /** * Determines if a file should be excluded based on ignore patterns. */ function shouldExcludeFile(relativePath, filename, ignorePatterns) { // Check default file patterns for (const pattern of DEFAULT_IGNORE_FILE_PATTERNS) { if (matchesGlobPattern(filename, pattern)) { return true; } } // Check custom ignore patterns for (const pattern of ignorePatterns) { if (matchesPathPattern(relativePath, pattern)) { return true; } } return false; } /** * Detects the language of a file from its extension or filename. 
*/ function detectLanguage(filePath) { const basename = path.basename(filePath).toLowerCase(); // Special filename-based detection if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) { return 'dockerfile'; } const ext = path.extname(filePath).toLowerCase(); return EXTENSION_LANGUAGE_MAP[ext] || 'unknown'; } /** * Detects the category of a file based on its extension and path. */ function detectCategory(filePath) { const basename = path.basename(filePath).toLowerCase(); const ext = path.extname(filePath).toLowerCase(); // Special filename-based detection for infra if (INFRA_FILENAMES.includes(basename)) { return 'infra'; } // Dockerfile without extension if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) { return 'infra'; } // Check path patterns for infra (e.g., .k8s, .helm directories) const normalizedPath = filePath.replace(/\\/g, '/'); if (normalizedPath.includes('.k8s/') || normalizedPath.includes('.helm/') || normalizedPath.includes('k8s/') || normalizedPath.includes('helm/')) { return 'infra'; } return EXTENSION_CATEGORY_MAP[ext] || 'code'; } /** * Counts the number of lines in a file. */ function countLines(filePath) { try { const content = fs.readFileSync(filePath, 'utf-8'); if (content.length === 0) return 0; return content.split('\n').length; } catch { return 0; } } /** * Recursively scans a directory for files, respecting ignore patterns. 
*/ function scanDirectory(dirPath, projectRoot, ignorePatterns, files) { let entries; try { entries = fs.readdirSync(dirPath, { withFileTypes: true }); } catch { return; } for (const entry of entries) { const fullPath = path.join(dirPath, entry.name); const relativePath = path.relative(projectRoot, fullPath); if (entry.isDirectory()) { // Skip excluded directories if (DEFAULT_IGNORE_DIRS.includes(entry.name)) { continue; } // Check if directory matches any ignore pattern const relDirPath = relativePath.replace(/\\/g, '/'); let skipDir = false; for (const pattern of ignorePatterns) { if (matchesPathPattern(relDirPath, pattern) || matchesPathPattern(relDirPath + '/', pattern)) { skipDir = true; break; } } if (skipDir) continue; scanDirectory(fullPath, projectRoot, ignorePatterns, files); } else if (entry.isFile()) { const relFilePath = relativePath.replace(/\\/g, '/'); // Check if file should be excluded if (shouldExcludeFile(relFilePath, entry.name, ignorePatterns)) { continue; } const language = detectLanguage(fullPath); const category = detectCategory(relFilePath); const lineCount = countLines(fullPath); files.push({ path: fullPath, relativePath: relFilePath, category, language, lineCount, }); } } } /** * Framework detection patterns based on config file names. */ const FRAMEWORK_FILE_PATTERNS = [ { pattern: 'next.config', framework: 'Next.js' }, { pattern: 'nuxt.config', framework: 'Nuxt' }, { pattern: 'angular.json', framework: 'Angular' }, { pattern: 'svelte.config', framework: 'Svelte' }, { pattern: 'astro.config', framework: 'Astro' }, { pattern: 'vite.config', framework: 'Vite' }, { pattern: 'webpack.config', framework: 'Webpack' }, { pattern: 'tailwind.config', framework: 'TailwindCSS' }, { pattern: 'manage.py', framework: 'Django' }, ]; /** * Framework detection from package.json dependencies. 
*/ const DEPENDENCY_FRAMEWORK_MAP = { 'react': 'React', 'vue': 'Vue', '@angular/core': 'Angular', 'next': 'Next.js', 'express': 'Express', 'fastify': 'Fastify', '@nestjs/core': 'NestJS', 'astro': 'Astro', 'svelte': 'Svelte', 'nuxt': 'Nuxt', 'vite': 'Vite', }; /** * Framework detection from Python dependencies. */ const PYTHON_FRAMEWORK_MAP = { 'django': 'Django', 'flask': 'Flask', 'fastapi': 'FastAPI', }; /** * Reads a file safely, returning null on failure. */ function readFileSafe(filePath) { try { return fs.readFileSync(filePath, 'utf-8'); } catch { return null; } } /** * Extracts a simple value from a TOML file using regex. * Handles both quoted and unquoted values. */ function extractTomlValue(content, section, key) { // Find the section const sectionRegex = new RegExp(`\\[${section.replace('.', '\\.')}\\]`); const sectionMatch = content.match(sectionRegex); if (!sectionMatch || sectionMatch.index === undefined) return ''; // Get content after section header until next section const afterSection = content.slice(sectionMatch.index + sectionMatch[0].length); const nextSectionIdx = afterSection.search(/^\[/m); const sectionContent = nextSectionIdx >= 0 ? afterSection.slice(0, nextSectionIdx) : afterSection; // Find the key const keyRegex = new RegExp(`^${key}\\s*=\\s*"([^"]*)"`, 'm'); const keyMatch = sectionContent.match(keyRegex); return keyMatch ? keyMatch[1] : ''; } /** * Extracts dependencies list from a TOML section (pyproject.toml). 
// end of the doc comment carried over from the previous line */
// Returns the lower-cased package names from the `dependencies` array of the
// [project] table. Version specifiers and environment markers are dropped; an
// extras suffix such as "[all]" is kept. Returns [] when nothing is declared.
// NOTE(review): the tail of this function was corrupted in the reviewed copy of
// the file; the name splitting below is a reconstruction — confirm against
// version control.
function extractTomlDependencies(content) {
  const headerMatch = /^[ \t]*\[project\][ \t]*(?:#.*)?$/m.exec(content);
  if (headerMatch === null) {
    return [];
  }

  // Stay inside [project] so arrays of later tables are never picked up.
  const afterHeader = content.slice(headerMatch.index + headerMatch[0].length);
  const nextHeaderIdx = afterHeader.search(/^\[/m);
  const sectionBody = nextHeaderIdx >= 0 ? afterHeader.slice(0, nextHeaderIdx) : afterHeader;

  const assignment = /^[ \t]*dependencies[ \t]*=[ \t]*\[/m.exec(sectionBody);
  if (assignment === null) {
    return [];
  }

  // Walk the array character by character: a "]" inside a quoted requirement
  // (e.g. "fastapi[all]") must not end the array, and single-line arrays hold
  // several requirements on one line.
  const names = [];
  let cursor = assignment.index + assignment[0].length;
  while (cursor < sectionBody.length) {
    const ch = sectionBody[cursor];
    if (ch === ']') {
      break;
    }
    if (ch === '#') {
      // Comment: skip to the end of the line.
      const lineEnd = sectionBody.indexOf('\n', cursor);
      if (lineEnd === -1) {
        break;
      }
      cursor = lineEnd + 1;
      continue;
    }
    if (ch === '"' || ch === "'") {
      const closing = sectionBody.indexOf(ch, cursor + 1);
      if (closing === -1) {
        break;
      }
      const requirement = sectionBody.slice(cursor + 1, closing);
      // The name ends at the first version operator, marker, URL or space.
      const packageName = requirement.split(/[\s<>=!~;@(,]/)[0].toLowerCase();
      if (packageName.length > 0) {
        names.push(packageName);
      }
      cursor = closing + 1;
      continue;
    }
    cursor += 1;
  }
  return names;
}

/**
 * Detects project metadata from manifest files and the scanned file list.
 *
 * Name and description come from the first manifest that yields a name, tried
 * in this order: package.json, pyproject.toml, Cargo.toml, go.mod, pom.xml.
 * Languages are ordered by number of files, most common first.
 *
 * NOTE(review): the function header and the package.json step were corrupted
 * in the reviewed copy of the file and are reconstructed from the names the
 * rest of the function uses — confirm against version control.
 *
 * @param {string} projectRoot - Project root directory.
 * @param {Array<{path: string, relativePath: string, language: string}>} files
 *   Files found by the scan.
 * @returns {{name: string, description: string, languages: string[],
 *   frameworks: string[], analyzedAt: string, gitCommitHash: string}}
 */
function detectMetadata(projectRoot, files) {
  let name = '';
  let description = '';
  const detectedFrameworks = new Set();

  // 1. Try package.json
  const packageJsonContent = readFileSafe(path.join(projectRoot, 'package.json'));
  if (packageJsonContent) {
    try {
      const pkg = JSON.parse(packageJsonContent);
      name = typeof pkg.name === 'string' ? pkg.name : '';
      description = typeof pkg.description === 'string' ? pkg.description : '';

      const declaredDeps = new Set([
        ...Object.keys(pkg.dependencies ?? {}),
        ...Object.keys(pkg.devDependencies ?? {}),
      ]);
      for (const [depName, framework] of Object.entries(DEPENDENCY_FRAMEWORK_MAP)) {
        if (declaredDeps.has(depName)) {
          detectedFrameworks.add(framework);
        }
      }
      // TypeScript is reported as a language, not a framework, so
      // `typescript` / `@types/*` dependencies need no handling here.
    } catch {
      // Invalid JSON, skip
    }
  }

  // 2. Try pyproject.toml
  if (!name) {
    const pyprojectContent = readFileSafe(path.join(projectRoot, 'pyproject.toml'));
    if (pyprojectContent) {
      name = extractTomlValue(pyprojectContent, 'project', 'name');
      description = extractTomlValue(pyprojectContent, 'project', 'description');

      // Detect Python frameworks from dependencies
      const deps = extractTomlDependencies(pyprojectContent);
      for (const [depName, framework] of Object.entries(PYTHON_FRAMEWORK_MAP)) {
        if (deps.some((d) => d === depName || d.startsWith(`${depName}[`))) {
          detectedFrameworks.add(framework);
        }
      }
    }
  }

  // 3. Try Cargo.toml
  if (!name) {
    const cargoContent = readFileSafe(path.join(projectRoot, 'Cargo.toml'));
    if (cargoContent) {
      name = extractTomlValue(cargoContent, 'package', 'name');
      description = extractTomlValue(cargoContent, 'package', 'description');
    }
  }

  // 4. Try go.mod
  if (!name) {
    const goModContent = readFileSafe(path.join(projectRoot, 'go.mod'));
    if (goModContent) {
      const moduleMatch = goModContent.match(/^module\s+(.+)$/m);
      if (moduleMatch) {
        name = moduleMatch[1].trim();
      }
    }
  }

  // 5. Try pom.xml (prefer <name>, fall back to <artifactId>)
  if (!name) {
    const pomContent = readFileSafe(path.join(projectRoot, 'pom.xml'));
    if (pomContent) {
      const artifactIdMatch = pomContent.match(/<artifactId>([^<]+)<\/artifactId>/);
      const nameMatch = pomContent.match(/<name>([^<]+)<\/name>/);
      name = nameMatch ? nameMatch[1] : (artifactIdMatch ? artifactIdMatch[1] : '');
      const descMatch = pomContent.match(/<description>([^<]+)<\/description>/);
      description = descMatch ? descMatch[1] : '';
    }
  }

  // 6. Detect languages from scanned files, most common first
  const languageCounts = new Map();
  for (const file of files) {
    if (file.language && file.language !== 'unknown') {
      languageCounts.set(file.language, (languageCounts.get(file.language) || 0) + 1);
    }
  }
  const languages = [...languageCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([lang]) => lang);

  // 7. Detect frameworks from config file names
  for (const file of files) {
    const basename = path.basename(file.relativePath);
    for (const { pattern, framework } of FRAMEWORK_FILE_PATTERNS) {
      // Patterns containing a dot are prefixes ("next.config" also covers
      // "next.config.mjs"); any other pattern has to match exactly.
      const isMatch = pattern.includes('.') ? basename.startsWith(pattern) : basename === pattern;
      if (isMatch) {
        detectedFrameworks.add(framework);
      }
    }
  }

  // Check for Flask/FastAPI imports in every app.py / wsgi.py, not just the
  // first one found.
  for (const file of files) {
    const basename = path.basename(file.relativePath);
    if (basename !== 'app.py' && basename !== 'wsgi.py') {
      continue;
    }
    const source = readFileSafe(file.path);
    if (!source) {
      continue;
    }
    if (source.includes('from flask') || source.includes('import flask')) {
      detectedFrameworks.add('Flask');
    }
    if (source.includes('from fastapi') || source.includes('import fastapi')) {
      detectedFrameworks.add('FastAPI');
    }
  }

  // Check for Rails from Gemfile
  const gemfileContent = readFileSafe(path.join(projectRoot, 'Gemfile'));
  if (gemfileContent) {
    if (gemfileContent.includes("'rails'") || gemfileContent.includes('"rails"')) {
      detectedFrameworks.add('Ruby on Rails');
    }
  }

  // 8. Get git commit hash
  let gitCommitHash = '';
  try {
    gitCommitHash = execSync('git rev-parse HEAD', {
      cwd: projectRoot,
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch {
    // Not a git repo or git not available
  }

  return {
    name,
    description,
    languages,
    frameworks: [...detectedFrameworks],
    analyzedAt: new Date().toISOString(),
    gitCommitHash,
  };
}

/**
 * Scans a project directory for source files, excluding common non-source
 * directories and respecting ignore patterns.
 *
 * @param {string} projectRoot - Absolute path to the project root directory
 * @param {string[]} [ignorePatterns] - Additional glob patterns to exclude;
 *   they are combined with the patterns from the project's .understandignore
 * @returns {{files: object[], metadata: object}} Discovered files and detected
 *   project metadata
 */
export function scanProject(projectRoot, ignorePatterns = []) {
  // Read .understand-anything/.understandignore patterns
  const understandIgnorePatterns = readUnderstandIgnore(projectRoot);

  // Combine all ignore patterns
  const allIgnorePatterns = [...ignorePatterns, ...understandIgnorePatterns];

  // Scan the directory tree
  const files = [];
  scanDirectory(projectRoot, projectRoot, allIgnorePatterns, files);

  // Detect project metadata from manifest files and scanned files
  const metadata = detectMetadata(projectRoot, files);

  return {
    files,
    metadata,
  };
}