openCLI / packages /core /src /utils /fileUtils.ts

Upload folder using huggingface_hub

40e575e verified 8 months ago

9.68 kB

	/**
	* @license
	* Copyright 2025 Google LLC
	* SPDX-License-Identifier: Apache-2.0
	*/

	import fs from 'fs';
	import path from 'path';
	import { PartUnion } from '@google/genai';
	import mime from 'mime-types';

	// Constants for text file processing
	const DEFAULT_MAX_LINES_TEXT_FILE = 2000;
	const MAX_LINE_LENGTH_TEXT_FILE = 2000;

	// Default values for encoding and separator format
	export const DEFAULT_ENCODING: BufferEncoding = 'utf-8';

	/**
	* Looks up the specific MIME type for a file path.
	* @param filePath Path to the file.
	* @returns The specific MIME type string (e.g., 'text/python', 'application/javascript') or undefined if not found or ambiguous.
	*/
	export function getSpecificMimeType(filePath: string): string \| undefined {
	const lookedUpMime = mime.lookup(filePath);
	return typeof lookedUpMime === 'string' ? lookedUpMime : undefined;
	}

	/**
	* Checks if a path is within a given root directory.
	* @param pathToCheck The absolute path to check.
	* @param rootDirectory The absolute root directory.
	* @returns True if the path is within the root directory, false otherwise.
	*/
	export function isWithinRoot(
	pathToCheck: string,
	rootDirectory: string,
	): boolean {
	const normalizedPathToCheck = path.normalize(pathToCheck);
	const normalizedRootDirectory = path.normalize(rootDirectory);

	// Ensure the rootDirectory path ends with a separator for correct startsWith comparison,
	// unless it's the root path itself (e.g., '/' or 'C:\').
	const rootWithSeparator =
	normalizedRootDirectory === path.sep \|\|
	normalizedRootDirectory.endsWith(path.sep)
	? normalizedRootDirectory
	: normalizedRootDirectory + path.sep;

	return (
	normalizedPathToCheck === normalizedRootDirectory \|\|
	normalizedPathToCheck.startsWith(rootWithSeparator)
	);
	}

	/**
	* Determines if a file is likely binary based on content sampling.
	* @param filePath Path to the file.
	* @returns True if the file appears to be binary.
	*/
	export function isBinaryFile(filePath: string): boolean {
	try {
	const fd = fs.openSync(filePath, 'r');
	// Read up to 4KB or file size, whichever is smaller
	const fileSize = fs.fstatSync(fd).size;
	if (fileSize === 0) {
	// Empty file is not considered binary for content checking
	fs.closeSync(fd);
	return false;
	}
	const bufferSize = Math.min(4096, fileSize);
	const buffer = Buffer.alloc(bufferSize);
	const bytesRead = fs.readSync(fd, buffer, 0, buffer.length, 0);
	fs.closeSync(fd);

	if (bytesRead === 0) return false;

	let nonPrintableCount = 0;
	for (let i = 0; i < bytesRead; i++) {
	if (buffer[i] === 0) return true; // Null byte is a strong indicator
	if (buffer[i] < 9 \|\| (buffer[i] > 13 && buffer[i] < 32)) {
	nonPrintableCount++;
	}
	}
	// If >30% non-printable characters, consider it binary
	return nonPrintableCount / bytesRead > 0.3;
	} catch {
	// If any error occurs (e.g. file not found, permissions),
	// treat as not binary here; let higher-level functions handle existence/access errors.
	return false;
	}
	}

	/**
	* Detects the type of file based on extension and content.
	* @param filePath Path to the file.
	* @returns 'text', 'image', 'pdf', or 'binary'.
	*/
	export function detectFileType(
	filePath: string,
	): 'text' \| 'image' \| 'pdf' \| 'binary' {
	const ext = path.extname(filePath).toLowerCase();
	const lookedUpMimeType = mime.lookup(filePath); // Returns false if not found, or the mime type string

	if (lookedUpMimeType && lookedUpMimeType.startsWith('image/')) {
	return 'image';
	}
	if (lookedUpMimeType && lookedUpMimeType === 'application/pdf') {
	return 'pdf';
	}

	// Stricter binary check for common non-text extensions before content check
	// These are often not well-covered by mime-types or might be misidentified.
	if (
	[
	'.zip',
	'.tar',
	'.gz',
	'.exe',
	'.dll',
	'.so',
	'.class',
	'.jar',
	'.war',
	'.7z',
	'.doc',
	'.docx',
	'.xls',
	'.xlsx',
	'.ppt',
	'.pptx',
	'.odt',
	'.ods',
	'.odp',
	'.bin',
	'.dat',
	'.obj',
	'.o',
	'.a',
	'.lib',
	'.wasm',
	'.pyc',
	'.pyo',
	].includes(ext)
	) {
	return 'binary';
	}

	// Fallback to content-based check if mime type wasn't conclusive for image/pdf
	// and it's not a known binary extension.
	if (isBinaryFile(filePath)) {
	return 'binary';
	}

	return 'text';
	}

	export interface ProcessedFileReadResult {
	llmContent: PartUnion; // string for text, Part for image/pdf/unreadable binary
	returnDisplay: string;
	error?: string; // Optional error message for the LLM if file processing failed
	isTruncated?: boolean; // For text files, indicates if content was truncated
	originalLineCount?: number; // For text files
	linesShown?: [number, number]; // For text files [startLine, endLine] (1-based for display)
	}

	/**
	* Reads and processes a single file, handling text, images, and PDFs.
	* @param filePath Absolute path to the file.
	* @param rootDirectory Absolute path to the project root for relative path display.
	* @param offset Optional offset for text files (0-based line number).
	* @param limit Optional limit for text files (number of lines to read).
	* @returns ProcessedFileReadResult object.
	*/
	export async function processSingleFileContent(
	filePath: string,
	rootDirectory: string,
	offset?: number,
	limit?: number,
	): Promise<ProcessedFileReadResult> {
	try {
	if (!fs.existsSync(filePath)) {
	// Sync check is acceptable before async read
	return {
	llmContent: '',
	returnDisplay: 'File not found.',
	error: `File not found: ${filePath}`,
	};
	}
	const stats = fs.statSync(filePath); // Sync check
	if (stats.isDirectory()) {
	return {
	llmContent: '',
	returnDisplay: 'Path is a directory.',
	error: `Path is a directory, not a file: ${filePath}`,
	};
	}

	const fileType = detectFileType(filePath);
	const relativePathForDisplay = path
	.relative(rootDirectory, filePath)
	.replace(/\\/g, '/');

	switch (fileType) {
	case 'binary': {
	return {
	llmContent: `Cannot display content of binary file: ${relativePathForDisplay}`,
	returnDisplay: `Skipped binary file: ${relativePathForDisplay}`,
	};
	}
	case 'text': {
	const content = await fs.promises.readFile(filePath, 'utf8');
	const lines = content.split('\n');
	const originalLineCount = lines.length;

	const startLine = offset \|\| 0;
	const effectiveLimit =
	limit === undefined ? DEFAULT_MAX_LINES_TEXT_FILE : limit;
	// Ensure endLine does not exceed originalLineCount
	const endLine = Math.min(startLine + effectiveLimit, originalLineCount);
	// Ensure selectedLines doesn't try to slice beyond array bounds if startLine is too high
	const actualStartLine = Math.min(startLine, originalLineCount);
	const selectedLines = lines.slice(actualStartLine, endLine);

	let linesWereTruncatedInLength = false;
	const formattedLines = selectedLines.map((line) => {
	if (line.length > MAX_LINE_LENGTH_TEXT_FILE) {
	linesWereTruncatedInLength = true;
	return (
	line.substring(0, MAX_LINE_LENGTH_TEXT_FILE) + '... [truncated]'
	);
	}
	return line;
	});

	const contentRangeTruncated = endLine < originalLineCount;
	const isTruncated = contentRangeTruncated \|\| linesWereTruncatedInLength;

	let llmTextContent = '';
	if (contentRangeTruncated) {
	llmTextContent += `[File content truncated: showing lines ${actualStartLine + 1}-${endLine} of ${originalLineCount} total lines. Use offset/limit parameters to view more.]\n`;
	} else if (linesWereTruncatedInLength) {
	llmTextContent += `[File content partially truncated: some lines exceeded maximum length of ${MAX_LINE_LENGTH_TEXT_FILE} characters.]\n`;
	}
	llmTextContent += formattedLines.join('\n');

	return {
	llmContent: llmTextContent,
	returnDisplay: isTruncated ? '(truncated)' : '',
	isTruncated,
	originalLineCount,
	linesShown: [actualStartLine + 1, endLine],
	};
	}
	case 'image':
	case 'pdf': {
	const contentBuffer = await fs.promises.readFile(filePath);
	const base64Data = contentBuffer.toString('base64');
	return {
	llmContent: {
	inlineData: {
	data: base64Data,
	mimeType: mime.lookup(filePath) \|\| 'application/octet-stream',
	},
	},
	returnDisplay: `Read ${fileType} file: ${relativePathForDisplay}`,
	};
	}
	default: {
	// Should not happen with current detectFileType logic
	const exhaustiveCheck: never = fileType;
	return {
	llmContent: `Unhandled file type: ${exhaustiveCheck}`,
	returnDisplay: `Skipped unhandled file type: ${relativePathForDisplay}`,
	error: `Unhandled file type for ${filePath}`,
	};
	}
	}
	} catch (error) {
	const errorMessage = error instanceof Error ? error.message : String(error);
	const displayPath = path
	.relative(rootDirectory, filePath)
	.replace(/\\/g, '/');
	return {
	llmContent: `Error reading file ${displayPath}: ${errorMessage}`,
	returnDisplay: `Error reading file ${displayPath}: ${errorMessage}`,
	error: `Error reading file ${filePath}: ${errorMessage}`,
	};
	}
	}