Spaces:
Running
Running
| import * as fs from 'node:fs'; | |
| import * as path from 'node:path'; | |
| import { execSync } from 'node:child_process'; | |
/**
 * Directory names that are skipped entirely during scanning.
 */
export const DEFAULT_IGNORE_DIRS = [
  'node_modules', '.git', 'dist', 'build', 'vendor', '__pycache__',
  '.next', '.cache', '.turbo', 'target', 'obj', '.understand-anything',
];

/**
 * Filename globs skipped during scanning (binary, generated, lock files).
 * Each suffix listed below is expanded to a `*.<suffix>` pattern.
 */
export const DEFAULT_IGNORE_FILE_PATTERNS = [
  // lock files, minified bundles, source maps
  'lock', 'min.js', 'min.css', 'map',
  // images
  'png', 'jpg', 'jpeg', 'gif', 'svg', 'ico',
  // fonts
  'woff', 'woff2', 'ttf', 'eot',
  // audio / video
  'mp3', 'mp4', 'webm',
  // archives and documents
  'zip', 'tar', 'gz', 'pdf',
  // native binaries
  'exe', 'dll', 'so', 'dylib',
].map((suffix) => `*.${suffix}`);
/**
 * Extension-to-language mapping.
 * Declared as ordered [language, extensions] groups and flattened into a
 * plain { extension: language } object.
 */
const EXTENSION_LANGUAGE_MAP = Object.fromEntries(
  [
    ['typescript', ['.ts', '.tsx']],
    ['javascript', ['.js', '.jsx', '.mjs', '.cjs']],
    ['python', ['.py']],
    ['go', ['.go']],
    ['rust', ['.rs']],
    ['java', ['.java']],
    ['ruby', ['.rb']],
    ['php', ['.php']],
    ['csharp', ['.cs']],
    ['cpp', ['.cpp', '.cc', '.cxx']],
    ['c', ['.c', '.h']],
    ['cpp', ['.hpp']],
    ['swift', ['.swift']],
    ['kotlin', ['.kt']],
    ['shell', ['.sh', '.bash']],
    ['sql', ['.sql']],
    ['html', ['.html', '.htm']],
    ['css', ['.css', '.scss', '.less']],
    ['json', ['.json']],
    ['yaml', ['.yaml', '.yml']],
    ['markdown', ['.md']],
    ['xml', ['.xml']],
    ['toml', ['.toml']],
    ['graphql', ['.graphql', '.gql']],
    ['protobuf', ['.proto']],
    ['terraform', ['.tf']],
    ['powershell', ['.ps1']],
    ['batch', ['.bat', '.cmd']],
    ['text', ['.txt']],
    ['restructuredtext', ['.rst']],
    ['asciidoc', ['.adoc']],
    ['env', ['.env']],
    ['ini', ['.ini', '.cfg']],
  ].flatMap(([language, extensions]) => extensions.map((ext) => [ext, language])),
);

/**
 * Extension-to-category mapping.
 * Declared as ordered [category, extensions] groups and flattened into a
 * plain { extension: category } object.
 */
const EXTENSION_CATEGORY_MAP = Object.fromEntries(
  [
    ['code', [
      '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs', '.py', '.go', '.rs',
      '.java', '.rb', '.php', '.cs', '.cpp', '.cc', '.cxx', '.c', '.h',
      '.hpp', '.swift', '.kt',
    ]],
    ['config', ['.json', '.yaml', '.yml', '.toml', '.xml', '.env', '.ini', '.cfg']],
    ['docs', ['.md', '.txt', '.rst', '.adoc']],
    ['infra', ['.tf']],
    ['data', ['.sql', '.graphql', '.gql', '.proto']],
    ['script', ['.sh', '.bash', '.ps1', '.bat', '.cmd']],
    ['markup', ['.html', '.htm', '.css', '.scss', '.less', '.svg']],
  ].flatMap(([category, extensions]) => extensions.map((ext) => [ext, category])),
);

/**
 * Lower-cased filenames that indicate the infra category regardless of extension.
 */
const INFRA_FILENAMES = ['dockerfile', 'docker-compose.yml', 'docker-compose.yaml'];
/**
 * Loads ignore patterns from `<projectRoot>/.understand-anything/.understandignore`.
 * Blank lines and lines starting with `#` are dropped; every other line is
 * returned trimmed. Any read failure (e.g. the file is missing) yields `[]`.
 */
function readUnderstandIgnore(projectRoot) {
  const ignoreFile = path.join(projectRoot, '.understand-anything', '.understandignore');
  let raw;
  try {
    raw = fs.readFileSync(ignoreFile, 'utf-8');
  } catch {
    return [];
  }
  const patterns = [];
  for (const rawLine of raw.split('\n')) {
    const entry = rawLine.trim();
    if (entry.length === 0 || entry.startsWith('#')) {
      continue;
    }
    patterns.push(entry);
  }
  return patterns;
}
/**
 * Tests a filename against a simple glob, case-insensitively.
 * `*` matches any run of characters, `?` matches a single character, and
 * every other character is matched literally. The whole name must match.
 */
function matchesGlobPattern(filename, pattern) {
  // Translate the glob in one pass: wildcards become regex fragments and
  // regex metacharacters are backslash-escaped.
  const source = pattern.replace(/[.+^${}()|[\]\\]|\*|\?/g, (token) => {
    if (token === '*') {
      return '.*';
    }
    if (token === '?') {
      return '.';
    }
    return `\\${token}`;
  });
  return new RegExp(`^${source}$`, 'i').test(filename);
}
/**
 * Checks if a relative path matches a glob pattern that may include directory
 * separators. Matching is case-insensitive and separator-agnostic (`\` and `/`
 * are treated alike).
 *
 * Patterns containing a slash are matched against the whole relative path:
 *   - `*`   matches any run of characters within one path segment
 *   - `?`   matches a single character
 *   - `**`  matches any run of characters, including `/`
 *   - `**` followed by `/` at the start of the pattern or right after a `/`
 *     matches zero or more whole directories, so `**` + `/foo.js` matches both
 *     `foo.js` and `a/b/foo.js`
 *   - a pattern ending in `/**` also matches the directory itself
 *
 * Patterns without a slash are matched against the basename only.
 *
 * @param relativePath - Path relative to the project root
 * @param pattern - Glob pattern
 * @returns true when the path matches the pattern
 */
function matchesPathPattern(relativePath, pattern) {
  const normalizedPath = relativePath.replace(/\\/g, '/');
  const normalizedPattern = pattern.replace(/\\/g, '/');
  // No slash: the pattern only constrains the file (or directory) name.
  if (!normalizedPattern.includes('/')) {
    return matchesGlobPattern(path.basename(relativePath), normalizedPattern);
  }
  // Translate the glob in a single pass so no placeholder token is needed.
  // The alternation order matters: longer wildcard tokens are tried first.
  const source = normalizedPattern.replace(
    /\*\*\/|\*\*|\*|\?|[.+^${}()|[\]\\]/g,
    (token, offset, whole) => {
      switch (token) {
        case '**/':
          // Only a globstar that forms a whole segment may match zero directories.
          return offset === 0 || whole[offset - 1] === '/' ? '(?:.*/)?' : '.*/';
        case '**':
          return '.*';
        case '*':
          return '[^/]*';
        case '?':
          return '.';
        default:
          return `\\${token}`;
      }
    },
  );
  if (new RegExp(`^${source}$`, 'i').test(normalizedPath)) {
    return true;
  }
  // "dir/**" should also match "dir" itself (used when pruning directories).
  if (normalizedPattern.endsWith('/**')) {
    const dirPrefix = normalizedPattern.slice(0, -3);
    return normalizedPath === dirPrefix || normalizedPath.startsWith(`${dirPrefix}/`);
  }
  return false;
}
/**
 * Decides whether a file is excluded from the scan.
 * A file is excluded when its name matches one of the default file globs, or
 * when its relative path matches one of the caller-supplied ignore patterns.
 */
function shouldExcludeFile(relativePath, filename, ignorePatterns) {
  const hitsDefaultGlob = DEFAULT_IGNORE_FILE_PATTERNS.some(
    (glob) => matchesGlobPattern(filename, glob),
  );
  if (hitsDefaultGlob) {
    return true;
  }
  return ignorePatterns.some((glob) => matchesPathPattern(relativePath, glob));
}
/**
 * Detects the language of a file from its extension or filename.
 *
 * `Dockerfile` and `Dockerfile.<suffix>` (any case) are reported as
 * 'dockerfile'. Otherwise the lower-cased extension is looked up in
 * EXTENSION_LANGUAGE_MAP. For dotfiles such as `.env`, `path.extname` returns
 * an empty string, so the whole basename is used as the lookup key instead.
 *
 * @param filePath - Absolute or relative file path
 * @returns The language name, or 'unknown' when nothing matches
 */
function detectLanguage(filePath) {
  const basename = path.basename(filePath).toLowerCase();
  // Special filename-based detection
  if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) {
    return 'dockerfile';
  }
  // A leading dot with no other dot (".env") is a dotfile, not an extension,
  // as far as path.extname is concerned; treat the name itself as the key.
  const ext = path.extname(basename) || (basename.startsWith('.') ? basename : '');
  return EXTENSION_LANGUAGE_MAP[ext] || 'unknown';
}
/**
 * Detects the category of a file based on its name, directory and extension.
 *
 * Resolution order:
 *   1. basename listed in INFRA_FILENAMES, or `Dockerfile` / `Dockerfile.<suffix>`
 *      (any case) -> 'infra'
 *   2. any directory segment of the path named `k8s`, `.k8s`, `helm` or `.helm`
 *      -> 'infra'
 *   3. lower-cased extension looked up in EXTENSION_CATEGORY_MAP (dotfiles such
 *      as `.env` use the whole basename as the key)
 *   4. otherwise 'code'
 *
 * @param filePath - File path (either separator style is accepted)
 * @returns The category name
 */
function detectCategory(filePath) {
  const basename = path.basename(filePath).toLowerCase();
  // Special filename-based detection for infra
  if (INFRA_FILENAMES.includes(basename)) {
    return 'infra';
  }
  // Dockerfile without extension
  if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) {
    return 'infra';
  }
  // Compare whole directory segments: a substring test would also flag
  // unrelated directories such as "overwhelm/".
  const infraDirs = ['k8s', '.k8s', 'helm', '.helm'];
  const directorySegments = filePath.replace(/\\/g, '/').split('/').slice(0, -1);
  if (directorySegments.some((segment) => infraDirs.includes(segment))) {
    return 'infra';
  }
  // path.extname returns '' for dotfiles like ".env"; use the name itself.
  const ext = path.extname(basename) || (basename.startsWith('.') ? basename : '');
  return EXTENSION_CATEGORY_MAP[ext] || 'code';
}
/**
 * Returns the number of lines in a file, read as UTF-8.
 * An empty or unreadable file counts as 0; otherwise the result is the number
 * of `\n` characters plus one.
 */
function countLines(filePath) {
  let text;
  try {
    text = fs.readFileSync(filePath, 'utf-8');
  } catch {
    return 0;
  }
  if (text === '') {
    return 0;
  }
  let lines = 1;
  for (let at = text.indexOf('\n'); at !== -1; at = text.indexOf('\n', at + 1)) {
    lines += 1;
  }
  return lines;
}
/**
 * Walks `dirPath` depth-first and appends one record per accepted file to `files`.
 *
 * Directories are pruned when their name is in DEFAULT_IGNORE_DIRS or when
 * their project-relative path (with or without a trailing slash) matches an
 * ignore pattern. Files are skipped when shouldExcludeFile says so. Entries
 * that are neither directories nor regular files are ignored, and unreadable
 * directories are silently skipped.
 */
function scanDirectory(dirPath, projectRoot, ignorePatterns, files) {
  let dirents;
  try {
    dirents = fs.readdirSync(dirPath, { withFileTypes: true });
  } catch {
    return;
  }
  for (const dirent of dirents) {
    const absolutePath = path.join(dirPath, dirent.name);
    // Project-relative path with forward slashes on every platform.
    const posixRelative = path.relative(projectRoot, absolutePath).replace(/\\/g, '/');

    if (dirent.isDirectory()) {
      if (DEFAULT_IGNORE_DIRS.includes(dirent.name)) {
        continue;
      }
      const isIgnoredDir = ignorePatterns.some(
        (glob) => matchesPathPattern(posixRelative, glob)
          || matchesPathPattern(posixRelative + '/', glob),
      );
      if (!isIgnoredDir) {
        scanDirectory(absolutePath, projectRoot, ignorePatterns, files);
      }
      continue;
    }

    if (!dirent.isFile()) {
      continue;
    }
    if (shouldExcludeFile(posixRelative, dirent.name, ignorePatterns)) {
      continue;
    }
    const language = detectLanguage(absolutePath);
    const category = detectCategory(posixRelative);
    const lineCount = countLines(absolutePath);
    files.push({
      path: absolutePath,
      relativePath: posixRelative,
      category,
      language,
      lineCount,
    });
  }
}
/**
 * Framework detection patterns based on config file names.
 * Each [pattern, framework] pair becomes a `{ pattern, framework }` record.
 */
const FRAMEWORK_FILE_PATTERNS = [
  ['next.config', 'Next.js'],
  ['nuxt.config', 'Nuxt'],
  ['angular.json', 'Angular'],
  ['svelte.config', 'Svelte'],
  ['astro.config', 'Astro'],
  ['vite.config', 'Vite'],
  ['webpack.config', 'Webpack'],
  ['tailwind.config', 'TailwindCSS'],
  ['manage.py', 'Django'],
].map(([pattern, framework]) => ({ pattern, framework }));

/**
 * Framework detection from package.json dependencies
 * (dependency name -> framework display name).
 */
const DEPENDENCY_FRAMEWORK_MAP = Object.fromEntries([
  ['react', 'React'],
  ['vue', 'Vue'],
  ['@angular/core', 'Angular'],
  ['next', 'Next.js'],
  ['express', 'Express'],
  ['fastify', 'Fastify'],
  ['@nestjs/core', 'NestJS'],
  ['astro', 'Astro'],
  ['svelte', 'Svelte'],
  ['nuxt', 'Nuxt'],
  ['vite', 'Vite'],
]);

/**
 * Framework detection from Python dependencies
 * (lower-cased package name -> framework display name).
 */
const PYTHON_FRAMEWORK_MAP = Object.fromEntries([
  ['django', 'Django'],
  ['flask', 'Flask'],
  ['fastapi', 'FastAPI'],
]);
/**
 * Reads a file as UTF-8 text without throwing.
 * Returns the file content, or null when the read fails for any reason.
 */
function readFileSafe(filePath) {
  let text = null;
  try {
    text = fs.readFileSync(filePath, 'utf-8');
  } catch {
    // Best-effort read: the caller treats null as "not available".
  }
  return text;
}
/**
 * Extracts a simple scalar value from a TOML document using regexes.
 *
 * The section header must stand on its own line (`[section]`, optionally
 * indented or followed by a comment), so a `[section]` that merely appears
 * inside a string value is not mistaken for the table. The key is searched
 * only up to the next line that starts with `[`.
 *
 * Supported value forms: double-quoted strings, single-quoted strings, and
 * bare scalars (numbers, booleans, dates) with an optional trailing comment.
 * Arrays and inline tables are not scalars and yield ''.
 * This is not a full TOML parser: escape sequences are returned as written.
 *
 * @param content - TOML source text
 * @param section - Table name without brackets, e.g. 'project' or 'tool.poetry'
 * @param key - Bare key to look up inside that table
 * @returns The value as a string, or '' when the section or key is missing
 */
function extractTomlValue(content, section, key) {
  // Section and key are matched literally, whatever characters they contain.
  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // Find the section header line
  const headerRegex = new RegExp(
    `^[ \\t]*\\[[ \\t]*${escapeRegExp(section)}[ \\t]*\\][ \\t]*(?:#.*)?$`,
    'm',
  );
  const headerMatch = headerRegex.exec(content);
  if (!headerMatch) {
    return '';
  }
  // Get content after section header until next section
  const afterHeader = content.slice(headerMatch.index + headerMatch[0].length);
  const nextSectionIdx = afterHeader.search(/^\[/m);
  const sectionContent = nextSectionIdx >= 0 ? afterHeader.slice(0, nextSectionIdx) : afterHeader;
  // Find the key: group 1 = "basic string", 2 = 'literal string', 3 = bare scalar
  const keyRegex = new RegExp(
    `^[ \\t]*${escapeRegExp(key)}[ \\t]*=[ \\t]*(?:"([^"]*)"|'([^']*)'|([^\\s"'#\\[{][^#\\r\\n]*))`,
    'm',
  );
  const keyMatch = keyRegex.exec(sectionContent);
  if (!keyMatch) {
    return '';
  }
  if (keyMatch[1] !== undefined) {
    return keyMatch[1];
  }
  if (keyMatch[2] !== undefined) {
    return keyMatch[2];
  }
  return keyMatch[3].trim();
}
/**
 * Extracts the package names listed in `dependencies = [...]` of the
 * `[project]` table of a pyproject.toml.
 *
 * The array is scanned quote-aware, so a `]` inside a requirement string
 * (extras such as "uvicorn[standard]") does not end the array, and both
 * single-line and multi-line arrays are handled. `#` comments outside strings
 * are skipped. Only the leading distribution name of each requirement is
 * kept; extras, version specifiers and environment markers are dropped.
 *
 * @param content - pyproject.toml source text
 * @returns Lower-cased package names in declaration order; [] when the
 *          `[project]` table or its `dependencies` array is absent
 */
function extractTomlDependencies(content) {
  // Locate the [project] table header (must stand on its own line).
  const header = /^[ \t]*\[[ \t]*project[ \t]*\][ \t]*(?:#.*)?$/m.exec(content);
  if (!header) {
    return [];
  }
  // Restrict the search to this table so that a `dependencies` array of a
  // later table is not picked up.
  const afterHeader = content.slice(header.index + header[0].length);
  const nextTableIdx = afterHeader.search(/^\[/m);
  const table = nextTableIdx >= 0 ? afterHeader.slice(0, nextTableIdx) : afterHeader;
  // Anchored to line start so e.g. `optional-dependencies = [` does not match.
  const assignment = /^[ \t]*dependencies[ \t]*=[ \t]*\[/m.exec(table);
  if (!assignment) {
    return [];
  }
  // Collect every quoted string up to the closing bracket of the array.
  const requirements = [];
  let quote = '';
  let current = '';
  let inComment = false;
  for (let i = assignment.index + assignment[0].length; i < table.length; i += 1) {
    const ch = table[i];
    if (inComment) {
      if (ch === '\n') {
        inComment = false;
      }
      continue;
    }
    if (quote) {
      if (ch === quote) {
        requirements.push(current);
        current = '';
        quote = '';
      } else {
        current += ch;
      }
      continue;
    }
    if (ch === ']') {
      break;
    }
    if (ch === '#') {
      inComment = true;
    } else if (ch === '"' || ch === "'") {
      quote = ch;
    }
  }
  // Keep only the distribution name at the start of each requirement.
  return requirements
    .map((requirement) => /^\s*([A-Za-z0-9][A-Za-z0-9._-]*)/.exec(requirement))
    .filter((nameMatch) => nameMatch !== null)
    .map((nameMatch) => nameMatch[1].toLowerCase());
}
/**
 * Detects project metadata from manifest files and scanned file entries.
 *
 * Name and description come from the first manifest that yields a name, tried
 * in this order: package.json, pyproject.toml, Cargo.toml, go.mod, pom.xml.
 * Frameworks are accumulated from every source, independently of which
 * manifest supplied the name: package.json dependencies, pyproject.toml
 * dependencies, well-known config file names, Flask/FastAPI imports in
 * `app.py` / `wsgi.py`, and a `rails` entry in the Gemfile.
 * Languages are taken from the scanned files, most frequent first.
 *
 * @param projectRoot - Absolute path to the project root directory
 * @param files - Scanned file entries; `path`, `relativePath` and `language` are read
 * @returns `{ name, description, languages, frameworks, analyzedAt, gitCommitHash }`;
 *          `gitCommitHash` is '' when `git rev-parse HEAD` fails
 */
export function detectMetadata(projectRoot, files) {
  let name = '';
  let description = '';
  const detectedFrameworks = new Set();

  // 1. Try package.json first (highest priority)
  const packageJsonContent = readFileSafe(path.join(projectRoot, 'package.json'));
  if (packageJsonContent) {
    try {
      const pkg = JSON.parse(packageJsonContent);
      name = pkg.name || '';
      description = pkg.description || '';
      // Detect frameworks from dependencies
      const allDeps = {
        ...(pkg.dependencies || {}),
        ...(pkg.devDependencies || {}),
      };
      for (const [depName, framework] of Object.entries(DEPENDENCY_FRAMEWORK_MAP)) {
        if (depName in allDeps) {
          detectedFrameworks.add(framework);
        }
      }
    } catch {
      // Invalid or unexpected JSON: fall through to the other manifests.
    }
  }

  // 2. Try pyproject.toml. Name/description are only a fallback, but the
  // dependency list is always inspected so that a project with both manifests
  // still reports its Python frameworks.
  const pyprojectContent = readFileSafe(path.join(projectRoot, 'pyproject.toml'));
  if (pyprojectContent) {
    if (!name) {
      name = extractTomlValue(pyprojectContent, 'project', 'name');
      description = extractTomlValue(pyprojectContent, 'project', 'description');
    }
    const pythonDeps = extractTomlDependencies(pyprojectContent);
    for (const [depName, framework] of Object.entries(PYTHON_FRAMEWORK_MAP)) {
      if (pythonDeps.some((dep) => dep === depName || dep.startsWith(depName + '['))) {
        detectedFrameworks.add(framework);
      }
    }
  }

  // 3. Try Cargo.toml
  if (!name) {
    const cargoContent = readFileSafe(path.join(projectRoot, 'Cargo.toml'));
    if (cargoContent) {
      name = extractTomlValue(cargoContent, 'package', 'name');
      description = extractTomlValue(cargoContent, 'package', 'description');
    }
  }

  // 4. Try go.mod
  if (!name) {
    const goModContent = readFileSafe(path.join(projectRoot, 'go.mod'));
    if (goModContent) {
      const moduleMatch = goModContent.match(/^module\s+(.+)$/m);
      if (moduleMatch) {
        name = moduleMatch[1].trim();
      }
    }
  }

  // 5. Try pom.xml (<name> wins over <artifactId>)
  if (!name) {
    const pomContent = readFileSafe(path.join(projectRoot, 'pom.xml'));
    if (pomContent) {
      const artifactIdMatch = pomContent.match(/<artifactId>([^<]+)<\/artifactId>/);
      const nameMatch = pomContent.match(/<name>([^<]+)<\/name>/);
      name = nameMatch ? nameMatch[1] : (artifactIdMatch ? artifactIdMatch[1] : '');
      const descMatch = pomContent.match(/<description>([^<]+)<\/description>/);
      description = descMatch ? descMatch[1] : '';
    }
  }

  // 6. Detect languages from scanned files, most common first
  const languageCounts = new Map();
  for (const file of files) {
    if (file.language && file.language !== 'unknown') {
      languageCounts.set(file.language, (languageCounts.get(file.language) || 0) + 1);
    }
  }
  const languages = [...languageCounts.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([lang]) => lang);

  // 7. Detect frameworks from file patterns
  for (const file of files) {
    const basename = path.basename(file.relativePath);
    for (const { pattern, framework } of FRAMEWORK_FILE_PATTERNS) {
      // Dotted patterns are prefixes ("next.config" matches "next.config.mjs");
      // a pattern without a dot must equal the file name.
      const isMatch = pattern.includes('.') ? basename.startsWith(pattern) : basename === pattern;
      if (isMatch) {
        detectedFrameworks.add(framework);
      }
    }
  }

  // Check for Flask/FastAPI imports in every app.py / wsgi.py that was scanned
  for (const file of files) {
    const basename = path.basename(file.relativePath);
    if (basename !== 'app.py' && basename !== 'wsgi.py') {
      continue;
    }
    const content = readFileSafe(file.path);
    if (!content) {
      continue;
    }
    if (content.includes('from flask') || content.includes('import flask')) {
      detectedFrameworks.add('Flask');
    }
    if (content.includes('from fastapi') || content.includes('import fastapi')) {
      detectedFrameworks.add('FastAPI');
    }
  }

  // Check for Rails from Gemfile
  const gemfileContent = readFileSafe(path.join(projectRoot, 'Gemfile'));
  if (gemfileContent) {
    if (gemfileContent.includes("'rails'") || gemfileContent.includes('"rails"')) {
      detectedFrameworks.add('Ruby on Rails');
    }
  }

  // 8. Get git commit hash
  let gitCommitHash = '';
  try {
    gitCommitHash = execSync('git rev-parse HEAD', {
      cwd: projectRoot,
      encoding: 'utf-8',
      stdio: ['pipe', 'pipe', 'pipe'],
    }).trim();
  } catch {
    // Not a git repo or git not available
  }

  return {
    name,
    description,
    languages,
    frameworks: [...detectedFrameworks],
    analyzedAt: new Date().toISOString(),
    gitCommitHash,
  };
}
/**
 * Entry point of the scanner: walks the project tree and derives metadata.
 *
 * Caller-supplied patterns are combined with the patterns read from
 * `.understand-anything/.understandignore` (caller patterns first), the tree
 * is scanned with the combined list, and metadata is detected from the result.
 *
 * @param projectRoot - Absolute path to the project root directory
 * @param ignorePatterns - Additional glob patterns to exclude
 * @returns ScanResult with discovered files and detected project metadata
 */
export function scanProject(projectRoot, ignorePatterns = []) {
  const fromIgnoreFile = readUnderstandIgnore(projectRoot);
  const discovered = [];
  scanDirectory(
    projectRoot,
    projectRoot,
    [...ignorePatterns, ...fromIgnoreFile],
    discovered,
  );
  return {
    files: discovered,
    metadata: detectMetadata(projectRoot, discovered),
  };
}