Spaces:

mr4
/

knowledge-graph-preview

Running

App Files Files Community

knowledge-graph-preview / cli /analyzer /scanner.js

mr4's picture

Upload 136 files

fd8cdf5 verified 2 days ago

history blame contribute delete

19.4 kB

	import * as fs from 'node:fs';
	import * as path from 'node:path';
	import { execSync } from 'node:child_process';
	/**
	 * Directory names the scanner never descends into.
	 * A directory is skipped when its own name (not its path) appears in this list.
	 */
	export const DEFAULT_IGNORE_DIRS = [
	  'node_modules', '.git',
	  'dist', 'build', 'vendor',
	  '__pycache__', '.next', '.cache', '.turbo',
	  'target', 'obj',
	  '.understand-anything',
	];
	/**
	 * Filename globs that are always excluded from a scan
	 * (lock files, minified/generated assets, images, fonts, media, archives, binaries).
	 */
	export const DEFAULT_IGNORE_FILE_PATTERNS = [
	  // lock files and generated assets
	  '*.lock', '*.min.js', '*.min.css', '*.map',
	  // images
	  '*.png', '*.jpg', '*.jpeg', '*.gif', '*.svg', '*.ico',
	  // fonts
	  '*.woff', '*.woff2', '*.ttf', '*.eot',
	  // audio / video
	  '*.mp3', '*.mp4', '*.webm',
	  // archives and documents
	  '*.zip', '*.tar', '*.gz', '*.pdf',
	  // native binaries
	  '*.exe', '*.dll', '*.so', '*.dylib',
	];
	/**
	 * Lookup table from a lower-cased file extension (leading dot included)
	 * to the language label reported for that file.
	 */
	const EXTENSION_LANGUAGE_MAP = {
	  '.ts': 'typescript', '.tsx': 'typescript',
	  '.js': 'javascript', '.jsx': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',
	  '.py': 'python',
	  '.go': 'go',
	  '.rs': 'rust',
	  '.java': 'java',
	  '.rb': 'ruby',
	  '.php': 'php',
	  '.cs': 'csharp',
	  '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp',
	  '.c': 'c', '.h': 'c',
	  '.hpp': 'cpp',
	  '.swift': 'swift',
	  '.kt': 'kotlin',
	  '.sh': 'shell', '.bash': 'shell',
	  '.sql': 'sql',
	  '.html': 'html', '.htm': 'html',
	  '.css': 'css', '.scss': 'css', '.less': 'css',
	  '.json': 'json',
	  '.yaml': 'yaml', '.yml': 'yaml',
	  '.md': 'markdown',
	  '.xml': 'xml',
	  '.toml': 'toml',
	  '.graphql': 'graphql', '.gql': 'graphql',
	  '.proto': 'protobuf',
	  '.tf': 'terraform',
	  '.ps1': 'powershell',
	  '.bat': 'batch', '.cmd': 'batch',
	  '.txt': 'text',
	  '.rst': 'restructuredtext',
	  '.adoc': 'asciidoc',
	  '.env': 'env',
	  '.ini': 'ini', '.cfg': 'ini',
	};
	/**
	 * Lookup table from a lower-cased file extension (leading dot included)
	 * to its category. Built from per-category extension lists so each group
	 * is visible at a glance; key order follows the lists below.
	 */
	const EXTENSION_CATEGORY_MAP = Object.fromEntries(
	  [
	    ['code', [
	      '.ts', '.tsx', '.js', '.jsx', '.mjs', '.cjs', '.py', '.go', '.rs', '.java', '.rb',
	      '.php', '.cs', '.cpp', '.cc', '.cxx', '.c', '.h', '.hpp', '.swift', '.kt',
	    ]],
	    ['config', ['.json', '.yaml', '.yml', '.toml', '.xml', '.env', '.ini', '.cfg']],
	    ['docs', ['.md', '.txt', '.rst', '.adoc']],
	    ['infra', ['.tf']],
	    ['data', ['.sql', '.graphql', '.gql', '.proto']],
	    ['script', ['.sh', '.bash', '.ps1', '.bat', '.cmd']],
	    ['markup', ['.html', '.htm', '.css', '.scss', '.less', '.svg']],
	  ].flatMap(([category, extensions]) => extensions.map((extension) => [extension, category])),
	);
	/**
	 * Lower-cased base names that are always categorised as infra,
	 * whatever their extension maps to.
	 */
	const INFRA_FILENAMES = ['dockerfile', 'docker-compose.yml', 'docker-compose.yaml'];
	/**
	 * Loads user-defined ignore patterns from
	 * `<projectRoot>/.understand-anything/.understandignore`.
	 *
	 * Each line is trimmed; blank lines and lines starting with `#` are dropped.
	 * Returns an empty array when the file cannot be read.
	 */
	function readUnderstandIgnore(projectRoot) {
	  const ignoreFile = path.join(projectRoot, '.understand-anything', '.understandignore');
	  let raw;
	  try {
	    raw = fs.readFileSync(ignoreFile, 'utf-8');
	  }
	  catch {
	    // Missing or unreadable ignore file simply means "no extra patterns".
	    return [];
	  }
	  const patterns = [];
	  for (const rawLine of raw.split('\n')) {
	    const entry = rawLine.trim();
	    if (entry.length === 0 || entry.startsWith('#')) {
	      continue;
	    }
	    patterns.push(entry);
	  }
	  return patterns;
	}
	/**
	 * Tests a bare filename against a simple glob.
	 *
	 * Supported wildcards: `*` (any run of characters, possibly empty) and
	 * `?` (exactly one character). Every other character is matched literally.
	 * The whole filename must match, and matching is case-insensitive.
	 *
	 * @param filename - File name without any directory part
	 * @param pattern - Glob pattern such as "*.min.js"
	 * @returns true when the filename matches the pattern
	 */
	function matchesGlobPattern(filename, pattern) {
	  const regexSource = pattern
	    // Neutralise regex metacharacters first; `*` and `?` are deliberately
	    // left alone so they can be translated below.
	    .replace(/[.+^${}()|[\]\\]/g, '\\$&')
	    .replace(/\*/g, '.*')
	    .replace(/\?/g, '.');
	  return new RegExp(`^${regexSource}$`, 'i').test(filename);
	}
	/**
	 * Tests a project-relative path against an ignore pattern.
	 *
	 * - Patterns containing a `/` are matched against the whole relative path:
	 *   `**` matches any run of characters (including `/`), `*` matches any run
	 *   of characters within one path segment, `?` matches one character.
	 *   A pattern ending in `/**` also matches the directory itself and
	 *   everything below it.
	 * - Patterns without a `/` are matched against the base name only, using
	 *   matchesGlobPattern.
	 *
	 * Backslashes in both arguments are treated as path separators, and
	 * matching is case-insensitive.
	 *
	 * @param relativePath - Path relative to the project root
	 * @param pattern - Ignore pattern
	 * @returns true when the path matches the pattern
	 */
	function matchesPathPattern(relativePath, pattern) {
	  const normalizedPath = relativePath.replace(/\\/g, '/');
	  const normalizedPattern = pattern.replace(/\\/g, '/');
	  if (!normalizedPattern.includes('/')) {
	    // No directory component: compare against the file/directory name only.
	    return matchesGlobPattern(path.basename(relativePath), normalizedPattern);
	  }
	  const regexSource = normalizedPattern
	    .replace(/[.+^${}()|[\]\\]/g, '\\$&')
	    // Park `**` behind a placeholder so the single-`*` rule below does not
	    // consume it; the placeholder is expanded right after.
	    .replace(/\*\*/g, '{{GLOBSTAR}}')
	    .replace(/\*/g, '[^/]*')
	    .replace(/\{\{GLOBSTAR\}\}/g, '.*')
	    .replace(/\?/g, '.');
	  if (new RegExp(`^${regexSource}$`, 'i').test(normalizedPath)) {
	    return true;
	  }
	  // "dir/**" should also cover "dir" itself and anything beneath it.
	  if (normalizedPattern.endsWith('/**')) {
	    const dirPrefix = normalizedPattern.slice(0, -3);
	    return normalizedPath === dirPrefix || normalizedPath.startsWith(dirPrefix + '/');
	  }
	  return false;
	}
	/**
	 * Decides whether a file is left out of the scan.
	 *
	 * A file is excluded when its name matches one of the built-in
	 * DEFAULT_IGNORE_FILE_PATTERNS, or when its relative path matches one of
	 * the caller-supplied ignore patterns.
	 */
	function shouldExcludeFile(relativePath, filename, ignorePatterns) {
	  const hitsDefault = DEFAULT_IGNORE_FILE_PATTERNS.some((glob) => matchesGlobPattern(filename, glob));
	  if (hitsDefault) {
	    return true;
	  }
	  for (const customPattern of ignorePatterns) {
	    if (matchesPathPattern(relativePath, customPattern)) {
	      return true;
	    }
	  }
	  return false;
	}
	/**
	 * Determines the language label for a file.
	 *
	 * Files named "Dockerfile" or "Dockerfile.<suffix>" (case-insensitive) are
	 * reported as 'dockerfile'. Everything else is looked up by lower-cased
	 * extension in EXTENSION_LANGUAGE_MAP.
	 *
	 * @param filePath - Absolute or relative path of the file
	 * @returns the language label, or 'unknown' when the extension is not mapped
	 */
	function detectLanguage(filePath) {
	  const basename = path.basename(filePath).toLowerCase();
	  // Dockerfiles have no useful extension, so they are recognised by name.
	  if (basename === 'dockerfile' || basename.startsWith('dockerfile.')) {
	    return 'dockerfile';
	  }
	  const ext = path.extname(filePath).toLowerCase();
	  return EXTENSION_LANGUAGE_MAP[ext] || 'unknown';
	}
	/**
	 * Determines the category of a file from its name, path and extension.
	 *
	 * Checks are applied in this order:
	 *   1. base name listed in INFRA_FILENAMES, or a "Dockerfile" /
	 *      "Dockerfile.<suffix>" name (case-insensitive)   -> 'infra'
	 *   2. path containing "k8s/" or "helm/" (which also covers the
	 *      dot-prefixed ".k8s/" and ".helm/")                -> 'infra'
	 *   3. lower-cased extension found in EXTENSION_CATEGORY_MAP
	 *   4. fallback                                          -> 'code'
	 *
	 * @param filePath - Path of the file (either separator style is accepted)
	 * @returns the category label
	 */
	function detectCategory(filePath) {
	  const basename = path.basename(filePath).toLowerCase();
	  if (INFRA_FILENAMES.includes(basename) ||
	    basename === 'dockerfile' ||
	    basename.startsWith('dockerfile.')) {
	    return 'infra';
	  }
	  // NOTE: this is a plain substring test on the normalised path, so any
	  // directory whose name merely ends in "k8s" or "helm" matches as well.
	  const normalizedPath = filePath.replace(/\\/g, '/');
	  const infraDirMarkers = ['.k8s/', '.helm/', 'k8s/', 'helm/'];
	  if (infraDirMarkers.some((marker) => normalizedPath.includes(marker))) {
	    return 'infra';
	  }
	  const ext = path.extname(filePath).toLowerCase();
	  return EXTENSION_CATEGORY_MAP[ext] || 'code';
	}
	/**
	 * Returns how many lines a file contains.
	 *
	 * The count is the number of '\n' characters plus one, so a file that ends
	 * with a newline reports one extra (empty) final line. An empty or
	 * unreadable file yields 0.
	 */
	function countLines(filePath) {
	  let text;
	  try {
	    text = fs.readFileSync(filePath, 'utf-8');
	  }
	  catch {
	    // Unreadable files are reported as having no lines.
	    return 0;
	  }
	  if (text === '') {
	    return 0;
	  }
	  let lines = 1;
	  for (let pos = text.indexOf('\n'); pos !== -1; pos = text.indexOf('\n', pos + 1)) {
	    lines += 1;
	  }
	  return lines;
	}
	/**
	 * Recursively walks a directory and appends one record per accepted file
	 * to `files`.
	 *
	 * Directories are skipped when their name is in DEFAULT_IGNORE_DIRS or when
	 * their relative path (with or without a trailing "/") matches an ignore
	 * pattern. Files are skipped when shouldExcludeFile says so. Entries that
	 * are neither directories nor regular files are ignored. A directory that
	 * cannot be read is silently skipped.
	 *
	 * @param dirPath - Directory to scan
	 * @param projectRoot - Root used to compute relative paths
	 * @param ignorePatterns - Caller/user supplied ignore patterns
	 * @param files - Output array; records are pushed onto it in place
	 */
	function scanDirectory(dirPath, projectRoot, ignorePatterns, files) {
	  let entries;
	  try {
	    entries = fs.readdirSync(dirPath, { withFileTypes: true });
	  }
	  catch {
	    // Best effort: an unreadable directory must not abort the whole scan.
	    return;
	  }
	  for (const entry of entries) {
	    const fullPath = path.join(dirPath, entry.name);
	    // Relative paths are always reported with forward slashes.
	    const relativePath = path.relative(projectRoot, fullPath).replace(/\\/g, '/');
	    if (entry.isDirectory()) {
	      if (DEFAULT_IGNORE_DIRS.includes(entry.name)) {
	        continue;
	      }
	      // Try both "dir" and "dir/" so patterns written either way apply.
	      const isIgnoredDir = ignorePatterns.some((pattern) => matchesPathPattern(relativePath, pattern) ||
	        matchesPathPattern(relativePath + '/', pattern));
	      if (isIgnoredDir) {
	        continue;
	      }
	      scanDirectory(fullPath, projectRoot, ignorePatterns, files);
	    }
	    else if (entry.isFile()) {
	      if (shouldExcludeFile(relativePath, entry.name, ignorePatterns)) {
	        continue;
	      }
	      const language = detectLanguage(fullPath);
	      const category = detectCategory(relativePath);
	      const lineCount = countLines(fullPath);
	      files.push({
	        path: fullPath,
	        relativePath,
	        category,
	        language,
	        lineCount,
	      });
	    }
	  }
	}
	/**
	 * File-name prefixes that reveal a framework, in the order they are checked.
	 * Each entry is expanded to a `{ pattern, framework }` record.
	 */
	const FRAMEWORK_FILE_PATTERNS = [
	  ['next.config', 'Next.js'],
	  ['nuxt.config', 'Nuxt'],
	  ['angular.json', 'Angular'],
	  ['svelte.config', 'Svelte'],
	  ['astro.config', 'Astro'],
	  ['vite.config', 'Vite'],
	  ['webpack.config', 'Webpack'],
	  ['tailwind.config', 'TailwindCSS'],
	  ['manage.py', 'Django'],
	].map(([pattern, framework]) => ({ pattern, framework }));
	/**
	 * npm package name -> framework label. A framework is reported when the
	 * package appears in package.json dependencies or devDependencies.
	 */
	const DEPENDENCY_FRAMEWORK_MAP = {
	  react: 'React',
	  vue: 'Vue',
	  '@angular/core': 'Angular',
	  next: 'Next.js',
	  express: 'Express',
	  fastify: 'Fastify',
	  '@nestjs/core': 'NestJS',
	  astro: 'Astro',
	  svelte: 'Svelte',
	  nuxt: 'Nuxt',
	  vite: 'Vite',
	};
	/**
	 * Lower-cased Python package name -> framework label, used when inspecting
	 * pyproject.toml dependencies.
	 */
	const PYTHON_FRAMEWORK_MAP = { django: 'Django', flask: 'Flask', fastapi: 'FastAPI' };
	/**
	 * Reads a file as UTF-8 text without ever throwing.
	 *
	 * @returns the file content, or null when the file cannot be read
	 */
	function readFileSafe(filePath) {
	  let text = null;
	  try {
	    text = fs.readFileSync(filePath, 'utf-8');
	  }
	  catch {
	    // Leave `text` as null: callers treat that as "file not available".
	  }
	  return text;
	}
	/**
	 * Extracts a simple string value from a TOML document using regexes.
	 *
	 * Only `key = "value"` (basic string) and `key = 'value'` (literal string)
	 * assignments are recognised; any amount of spaces or tabs is allowed around
	 * the `=`. The key is searched only between the `[section]` header and the
	 * next line that starts with `[`. This is a lightweight extractor, not a
	 * TOML parser: escapes, multi-line strings and non-string values are not
	 * interpreted.
	 *
	 * @param content - Full TOML text
	 * @param section - Table name without brackets, e.g. "project" or "tool.poetry"
	 * @param key - Key to look up inside that table
	 * @returns the string value, or '' when the section or key is not found
	 */
	function extractTomlValue(content, section, key) {
	  // Section and key are inserted into regexes, so escape every metacharacter.
	  const escapeRegExp = (text) => text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
	  // Anchor the header at the start of a line so "[section]" appearing inside
	  // a value or comment is not mistaken for the table header.
	  const headerRegex = new RegExp(`^[ \\t]*\\[${escapeRegExp(section)}\\]`, 'm');
	  const headerMatch = headerRegex.exec(content);
	  if (!headerMatch) {
	    return '';
	  }
	  // Restrict the search to this table: stop at the next line starting with "[".
	  const afterHeader = content.slice(headerMatch.index + headerMatch[0].length);
	  const nextHeaderIdx = afterHeader.search(/^\[/m);
	  const sectionContent = nextHeaderIdx >= 0 ? afterHeader.slice(0, nextHeaderIdx) : afterHeader;
	  const keyRegex = new RegExp(`^[ \\t]*${escapeRegExp(key)}[ \\t]*=[ \\t]*(?:"([^"]*)"|'([^']*)')`, 'm');
	  const keyMatch = keyRegex.exec(sectionContent);
	  if (!keyMatch) {
	    return '';
	  }
	  // Group 1 is the double-quoted form, group 2 the single-quoted form.
	  return keyMatch[1] !== undefined ? keyMatch[1] : keyMatch[2];
	}
	/**
	 * Extracts the package names listed in the `dependencies = [...]` array
	 * that follows the `[project]` header of a pyproject.toml.
	 *
	 * Every quoted requirement string in the array contributes one entry: the
	 * text before the first version operator, environment marker, URL marker
	 * or whitespace, lower-cased. Extras are kept, so "uvicorn[standard]>=0.2"
	 * yields "uvicorn[standard]". Single-line and multi-line arrays are both
	 * handled, and `#` comments inside the array are skipped.
	 *
	 * This is a lightweight scan, not a TOML parser: the first line-leading
	 * `dependencies = [` after `[project]` is used, and escaped quotes inside
	 * requirement strings are not interpreted.
	 *
	 * @param content - Full pyproject.toml text
	 * @returns lower-cased package names, or [] when no such array exists
	 */
	function extractTomlDependencies(content) {
	  // Requiring the key at the start of a line avoids matching
	  // `dynamic = ["dependencies"]` or keys that merely end in "dependencies".
	  const arrayStart = /\[project\][\s\S]*?^[ \t]*dependencies\s*=\s*\[/m.exec(content);
	  if (!arrayStart) {
	    return [];
	  }
	  const names = [];
	  let pos = arrayStart.index + arrayStart[0].length;
	  // Walk the array character by character so a "]" inside a quoted
	  // requirement (extras) or inside a comment does not end the array early.
	  while (pos < content.length) {
	    const ch = content[pos];
	    if (ch === ']') {
	      break;
	    }
	    if (ch === '#') {
	      const lineEnd = content.indexOf('\n', pos);
	      if (lineEnd === -1) {
	        break;
	      }
	      pos = lineEnd + 1;
	      continue;
	    }
	    if (ch === '"' || ch === "'") {
	      const closing = content.indexOf(ch, pos + 1);
	      if (closing === -1) {
	        break;
	      }
	      const requirement = content.slice(pos + 1, closing).trim();
	      const name = requirement.split(/[>=<!~;@\s]/)[0].toLowerCase();
	      if (name.length > 0) {
	        names.push(name);
	      }
	      pos = closing + 1;
	      continue;
	    }
	    pos += 1;
	  }
	  return names;
	}
	/**
	 * Detects project metadata from manifest files and the scanned file list.
	 *
	 * Name and description come from the first manifest that yields a name,
	 * tried in this order: package.json, pyproject.toml, Cargo.toml, go.mod,
	 * pom.xml. Frameworks are collected from package.json dependencies,
	 * pyproject.toml dependencies (only consulted when package.json gave no
	 * name), well-known config file names, Flask/FastAPI imports in the first
	 * app.py or wsgi.py found, and a Rails entry in the Gemfile. Languages are
	 * the distinct non-'unknown' `language` values of `files`, most frequent
	 * first.
	 *
	 * Runs `git rev-parse HEAD` in `projectRoot`; any failure leaves the commit
	 * hash empty.
	 *
	 * @param projectRoot - Absolute path to the project root directory
	 * @param files - Scanned file records (uses `path`, `relativePath`, `language`)
	 * @returns { name, description, languages, frameworks, analyzedAt, gitCommitHash }
	 */
	export function detectMetadata(projectRoot, files) {
	  let name = '';
	  let description = '';
	  // A Set keeps each framework once, in first-detected order.
	  const detectedFrameworks = new Set();
	  // 1. Try package.json first (highest priority)
	  const packageJsonContent = readFileSafe(path.join(projectRoot, 'package.json'));
	  if (packageJsonContent) {
	    try {
	      const pkg = JSON.parse(packageJsonContent);
	      name = pkg.name || '';
	      description = pkg.description || '';
	      const allDeps = {
	        ...(pkg.dependencies || {}),
	        ...(pkg.devDependencies || {}),
	      };
	      for (const [depName, framework] of Object.entries(DEPENDENCY_FRAMEWORK_MAP)) {
	        if (depName in allDeps) {
	          detectedFrameworks.add(framework);
	        }
	      }
	    }
	    catch {
	      // Invalid JSON (or a non-object document): fall through to other manifests.
	    }
	  }
	  // 2. Try pyproject.toml
	  if (!name) {
	    const pyprojectContent = readFileSafe(path.join(projectRoot, 'pyproject.toml'));
	    if (pyprojectContent) {
	      name = extractTomlValue(pyprojectContent, 'project', 'name');
	      description = extractTomlValue(pyprojectContent, 'project', 'description');
	      const deps = extractTomlDependencies(pyprojectContent);
	      for (const [depName, framework] of Object.entries(PYTHON_FRAMEWORK_MAP)) {
	        // Also accept the package when it is listed with extras, e.g. "fastapi[all]".
	        if (deps.some((dep) => dep === depName || dep.startsWith(depName + '['))) {
	          detectedFrameworks.add(framework);
	        }
	      }
	    }
	  }
	  // 3. Try Cargo.toml
	  if (!name) {
	    const cargoContent = readFileSafe(path.join(projectRoot, 'Cargo.toml'));
	    if (cargoContent) {
	      name = extractTomlValue(cargoContent, 'package', 'name');
	      description = extractTomlValue(cargoContent, 'package', 'description');
	    }
	  }
	  // 4. Try go.mod
	  if (!name) {
	    const goModContent = readFileSafe(path.join(projectRoot, 'go.mod'));
	    if (goModContent) {
	      const moduleMatch = goModContent.match(/^module\s+(.+)$/m);
	      if (moduleMatch) {
	        name = moduleMatch[1].trim();
	      }
	    }
	  }
	  // 5. Try pom.xml
	  if (!name) {
	    const pomContent = readFileSafe(path.join(projectRoot, 'pom.xml'));
	    if (pomContent) {
	      const artifactIdMatch = pomContent.match(/<artifactId>([^<]+)<\/artifactId>/);
	      const nameMatch = pomContent.match(/<name>([^<]+)<\/name>/);
	      // Prefer <name>; fall back to the first <artifactId>.
	      name = nameMatch ? nameMatch[1] : (artifactIdMatch ? artifactIdMatch[1] : '');
	      const descMatch = pomContent.match(/<description>([^<]+)<\/description>/);
	      description = descMatch ? descMatch[1] : '';
	    }
	  }
	  // 6. Detect languages from scanned files, most common first
	  const languageCounts = new Map();
	  for (const file of files) {
	    if (file.language && file.language !== 'unknown') {
	      languageCounts.set(file.language, (languageCounts.get(file.language) || 0) + 1);
	    }
	  }
	  const languages = [...languageCounts.entries()]
	    .sort((a, b) => b[1] - a[1])
	    .map(([lang]) => lang);
	  // 7. Detect frameworks from well-known file names
	  for (const file of files) {
	    const basename = path.basename(file.relativePath);
	    for (const { pattern, framework } of FRAMEWORK_FILE_PATTERNS) {
	      // Dotted patterns are prefixes (e.g. "next.config" matches
	      // "next.config.mjs"); a pattern without a dot must match exactly.
	      const isMatch = pattern.includes('.') ? basename.startsWith(pattern) : basename === pattern;
	      if (isMatch) {
	        detectedFrameworks.add(framework);
	      }
	    }
	  }
	  // Check for Flask/FastAPI imports in the first app.py or wsgi.py found
	  const entryFile = files.find((file) => {
	    const basename = path.basename(file.relativePath);
	    return basename === 'app.py' || basename === 'wsgi.py';
	  });
	  if (entryFile) {
	    const content = readFileSafe(entryFile.path);
	    if (content) {
	      if (content.includes('from flask') || content.includes('import flask')) {
	        detectedFrameworks.add('Flask');
	      }
	      if (content.includes('from fastapi') || content.includes('import fastapi')) {
	        detectedFrameworks.add('FastAPI');
	      }
	    }
	  }
	  // Check for Rails from Gemfile
	  const gemfileContent = readFileSafe(path.join(projectRoot, 'Gemfile'));
	  if (gemfileContent) {
	    if (gemfileContent.includes("'rails'") || gemfileContent.includes('"rails"')) {
	      detectedFrameworks.add('Ruby on Rails');
	    }
	  }
	  // 8. Get git commit hash
	  let gitCommitHash = '';
	  try {
	    gitCommitHash = execSync('git rev-parse HEAD', {
	      cwd: projectRoot,
	      encoding: 'utf-8',
	      // Pipe all streams so git's error output does not leak to the console.
	      stdio: ['pipe', 'pipe', 'pipe'],
	    }).trim();
	  }
	  catch {
	    // Not a git repo or git not available
	  }
	  return {
	    name,
	    description,
	    languages,
	    frameworks: [...detectedFrameworks],
	    analyzedAt: new Date().toISOString(),
	    gitCommitHash,
	  };
	}
	/**
	 * Entry point of the scanner: walks the project tree and gathers metadata.
	 *
	 * The caller's patterns are combined with those read from
	 * `.understand-anything/.understandignore` (caller patterns first), the tree
	 * is scanned with the combined list, and project metadata is then derived
	 * from the manifest files and the discovered files.
	 *
	 * @param projectRoot - Absolute path to the project root directory
	 * @param ignorePatterns - Additional glob patterns to exclude
	 * @returns ScanResult with discovered files and detected project metadata
	 */
	export function scanProject(projectRoot, ignorePatterns = []) {
	  const patternsFromFile = readUnderstandIgnore(projectRoot);
	  const discovered = [];
	  scanDirectory(projectRoot, projectRoot, [...ignorePatterns, ...patternsFromFile], discovered);
	  return {
	    files: discovered,
	    metadata: detectMetadata(projectRoot, discovered),
	  };
	}