Spaces:

Kraft102
/

widgetdc-cortex

Paused

App Files Files Community

widgetdc-cortex / apps /backend /src /services /ingestion /LocalFileScanner.ts

Kraft102

Initial deployment - WidgeTDC Cortex Backend v2.1.0

529090e 2 months ago

raw

history blame contribute delete

6.41 kB

	// LocalFileScanner – Scans local file system for documents and data
	import { promises as fs } from 'fs';
	import * as path from 'path';
	import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';

	/**
	 * Options accepted by the LocalFileScanner constructor.
	 *
	 * Only `rootPaths` is required; the constructor supplies a default for every
	 * other field that the caller omits.
	 */
	export interface LocalFileScannerConfig {
	/** Directories to scan. Each one is walked recursively by `fetch()`. */
	rootPaths: string[];
	/**
	 * File extensions to include (e.g., ['.txt', '.pdf', '.docx']).
	 * Entries are compared against the lower-cased result of `path.extname`,
	 * so they need a leading dot and must be lower case to match.
	 * An empty array disables the extension filter.
	 */
	extensions?: string[];
	/**
	 * Maximum depth to recurse. A root path itself is depth 0 and each nested
	 * directory level adds one. Defaults to 3.
	 */
	maxDepth?: number;
	/** Maximum file size in bytes (default: 10MB). Larger files are skipped with a warning. */
	maxFileSize?: number;
	/**
	 * Path fragments to exclude. NOTE: these are NOT globs. An entry is skipped
	 * when its full path (root path included) contains any fragment as a plain
	 * substring, so 'dist' also matches 'distance.txt'.
	 * Defaults to ['node_modules', '.git', 'dist', 'build'].
	 */
	excludePatterns?: string[];
	}

	/** File record produced by `fetch()` and consumed by `transform()`. */
	interface ScannedFile {
	  path: string;
	  name: string;
	  size: number;
	  modifiedAt: Date;
	  extension: string;
	}

	/**
	 * Data source adapter that walks local directories and turns the files it
	 * finds into `IngestedEntity` records.
	 *
	 * - `fetch()` collects file metadata (path, name, size, mtime, extension).
	 * - `transform()` reads the content of text files and PDFs and builds entities.
	 *
	 * Scanning is best-effort: unreadable directories or files are logged with
	 * `console.warn` and skipped instead of failing the whole scan.
	 *
	 * Exclusion patterns are plain substrings tested against the full path
	 * (root path included), not globs. A root path that itself contains one of
	 * the patterns therefore excludes everything beneath it.
	 */
	export class LocalFileScanner implements DataSourceAdapter {
	  name = 'Local File Scanner';
	  type = 'local_files' as const;

	  /** Extensions whose content is read verbatim as UTF-8 text. */
	  private static readonly textExtensions = ['.txt', '.md', '.json', '.csv'];
	  /** Maximum number of characters of content kept per entity. */
	  private static readonly maxContentLength = 50000;

	  private config: LocalFileScannerConfig;

	  /**
	   * @param config Scanner options. Omitted fields fall back to the defaults
	   *   below. A limit that is still `undefined` after merging disables the
	   *   corresponding check.
	   */
	  constructor(config: LocalFileScannerConfig) {
	    this.config = {
	      maxDepth: 3,
	      maxFileSize: 10 * 1024 * 1024, // 10MB
	      extensions: ['.txt', '.md', '.pdf', '.docx', '.xlsx', '.csv', '.json'],
	      excludePatterns: ['node_modules', '.git', 'dist', 'build'],
	      ...config
	    };
	  }

	  /**
	   * Reports whether the scanner has anything to scan.
	   *
	   * @returns `true` as soon as one configured root path is accessible,
	   *   `false` when none is (or when no root paths are configured).
	   */
	  async isAvailable(): Promise<boolean> {
	    for (const rootPath of this.config.rootPaths) {
	      try {
	        await fs.access(rootPath);
	        return true;
	      } catch {
	        // Not accessible; try the next root.
	      }
	    }
	    return false;
	  }

	  /**
	   * Walks every configured root path and collects metadata for matching files.
	   *
	   * @returns One record per accepted file with `path`, `name`, `size`,
	   *   `modifiedAt` (Date) and lower-cased `extension`.
	   */
	  async fetch(): Promise<any[]> {
	    const files: ScannedFile[] = [];

	    for (const rootPath of this.config.rootPaths) {
	      try {
	        await this.walkDirectory(rootPath, 0, files);
	      } catch (error: unknown) {
	        console.warn(`Failed to scan ${rootPath}:`, LocalFileScanner.errorMessage(error));
	      }
	    }

	    return files;
	  }

	  /**
	   * Recursively scans `dirPath`, appending accepted files to `results`.
	   *
	   * @param dirPath Directory to scan.
	   * @param depth Depth of `dirPath` relative to its root (the root is 0).
	   * @param results Accumulator that accepted file records are pushed onto.
	   */
	  private async walkDirectory(dirPath: string, depth: number, results: ScannedFile[]): Promise<void> {
	    const { maxDepth, maxFileSize, extensions } = this.config;

	    // Compare against null/undefined explicitly so that a limit of 0
	    // ("scan the root only") is honoured instead of being read as "no limit".
	    if (maxDepth != null && depth > maxDepth) {
	      return;
	    }

	    const entries = await this.listEntries(dirPath);

	    for (const entry of entries) {
	      const fullPath = path.join(dirPath, entry.name);

	      if (this.shouldExclude(fullPath)) {
	        continue;
	      }

	      if (entry.isDirectory()) {
	        await this.walkDirectory(fullPath, depth + 1, results);
	        continue;
	      }

	      if (!entry.isFile()) {
	        continue;
	      }

	      const extension = path.extname(entry.name).toLowerCase();
	      if (extensions && extensions.length > 0 && !extensions.includes(extension)) {
	        continue;
	      }

	      // A failing stat (permissions, file removed mid-scan) must only skip
	      // this file, not the remaining entries of the directory.
	      const stat = await this.statOrNull(fullPath);
	      if (stat === null) {
	        continue;
	      }

	      // Explicit null check so a limit of 0 bytes is not treated as "no limit".
	      if (maxFileSize != null && stat.size > maxFileSize) {
	        console.warn(`Skipping large file: ${fullPath} (${stat.size} bytes)`);
	        continue;
	      }

	      results.push({
	        path: fullPath,
	        name: entry.name,
	        size: stat.size,
	        modifiedAt: stat.mtime,
	        extension
	      });
	    }
	  }

	  /** Lists the entries of `dirPath`; logs a warning and yields nothing when it cannot be read. */
	  private async listEntries(dirPath: string) {
	    try {
	      return await fs.readdir(dirPath, { withFileTypes: true });
	    } catch (error: unknown) {
	      console.warn(`Failed to read directory ${dirPath}:`, LocalFileScanner.errorMessage(error));
	      return [];
	    }
	  }

	  /** Stats `filePath`; logs a warning and returns `null` when the call fails. */
	  private async statOrNull(filePath: string) {
	    try {
	      return await fs.stat(filePath);
	    } catch (error: unknown) {
	      console.warn(`Failed to stat file ${filePath}:`, LocalFileScanner.errorMessage(error));
	      return null;
	    }
	  }

	  /**
	   * Tells whether `filePath` contains any configured exclusion pattern.
	   * Matching is a plain substring test on the whole path, not a glob match.
	   */
	  private shouldExclude(filePath: string): boolean {
	    if (!this.config.excludePatterns) return false;

	    return this.config.excludePatterns.some(pattern =>
	      filePath.includes(pattern)
	    );
	  }

	  /**
	   * Converts file records from `fetch()` into ingestible entities.
	   *
	   * Content is read for text files and PDFs, left empty for every other
	   * extension, and truncated to 50000 characters. A record that cannot be
	   * converted is logged and skipped.
	   *
	   * @param files Records as returned by `fetch()`.
	   * @returns One entity per successfully converted record.
	   */
	  async transform(files: any[]): Promise<IngestedEntity[]> {
	    const entities: IngestedEntity[] = [];

	    for (const file of files) {
	      try {
	        let content = await this.readContent(file);

	        // Safety limit on stored content, applied to PDFs as well.
	        if (content.length > LocalFileScanner.maxContentLength) {
	          content = content.substring(0, LocalFileScanner.maxContentLength) + '... (truncated)';
	        }

	        entities.push({
	          id: file.path,
	          type: 'local_file',
	          source: 'Local File Scanner',
	          title: file.name,
	          content,
	          metadata: {
	            path: file.path,
	            size: file.size,
	            extension: file.extension,
	            // Convert Date to ISO string for Neo4j compatibility
	            modifiedAt: file.modifiedAt instanceof Date
	              ? file.modifiedAt.toISOString()
	              : String(file.modifiedAt),
	          },
	          timestamp: new Date()
	        });
	      } catch (error: unknown) {
	        console.warn(`Failed to transform file ${file.path}:`, LocalFileScanner.errorMessage(error));
	      }
	    }

	    return entities;
	  }

	  /**
	   * Reads the textual content of a scanned file.
	   *
	   * @returns The file text, a placeholder when reading fails, or an empty
	   *   string for extensions whose content is not extracted.
	   */
	  private async readContent(file: ScannedFile): Promise<string> {
	    if (LocalFileScanner.textExtensions.includes(file.extension)) {
	      try {
	        return await fs.readFile(file.path, 'utf-8');
	      } catch {
	        return '(unable to read)';
	      }
	    }

	    if (file.extension === '.pdf') {
	      return this.readPdfText(file.path);
	    }

	    return '';
	  }

	  /** Extracts the text of a PDF, returning a placeholder when parsing fails. */
	  private async readPdfText(filePath: string): Promise<string> {
	    try {
	      const dataBuffer = await fs.readFile(filePath);
	      // pdf-parse v2.x uses a class-based API
	      const { PDFParse } = await import('pdf-parse');
	      const parser = new PDFParse({ data: dataBuffer });
	      try {
	        const textResult = await parser.getText();
	        return textResult.text;
	      } finally {
	        // Release the parser even when text extraction throws.
	        await parser.destroy();
	      }
	    } catch (e) {
	      console.warn(`Failed to parse PDF ${filePath}:`, e);
	      return '(unable to read pdf)';
	    }
	  }

	  /** Extracts a printable message from a value caught in a `catch` clause. */
	  private static errorMessage(error: unknown): string {
	    return error instanceof Error ? error.message : String(error);
	  }
	}