openCLI / packages /core /src /utils /editCorrector.ts

Upload folder using huggingface_hub

40e575e verified 8 months ago

22.2 kB

	/**
	* @license
	* Copyright 2025 Google LLC
	* SPDX-License-Identifier: Apache-2.0
	*/

	import {
	Content,
	GenerateContentConfig,
	SchemaUnion,
	Type,
	} from '@google/genai';
	import { GeminiClient } from '../core/client.js';
	import { EditToolParams } from '../tools/edit.js';
	import { LruCache } from './LruCache.js';
	import { DEFAULT_GEMINI_FLASH_MODEL } from '../config/models.js';

	const EditModel = DEFAULT_GEMINI_FLASH_MODEL;
	const EditConfig: GenerateContentConfig = {
	thinkingConfig: {
	thinkingBudget: 0,
	},
	};

	const MAX_CACHE_SIZE = 50;

	// Cache for ensureCorrectEdit results
	const editCorrectionCache = new LruCache<string, CorrectedEditResult>(
	MAX_CACHE_SIZE,
	);

	// Cache for ensureCorrectFileContent results
	const fileContentCorrectionCache = new LruCache<string, string>(MAX_CACHE_SIZE);

	/**
	* Defines the structure of the parameters within CorrectedEditResult
	*/
	interface CorrectedEditParams {
	file_path: string;
	old_string: string;
	new_string: string;
	}

	/**
	* Defines the result structure for ensureCorrectEdit.
	*/
	export interface CorrectedEditResult {
	params: CorrectedEditParams;
	occurrences: number;
	}

	/**
	* Attempts to correct edit parameters if the original old_string is not found.
	* It tries unescaping, and then LLM-based correction.
	* Results are cached to avoid redundant processing.
	*
	* @param currentContent The current content of the file.
	* @param originalParams The original EditToolParams
	* @param client The GeminiClient for LLM calls.
	* @returns A promise resolving to an object containing the (potentially corrected)
	* EditToolParams (as CorrectedEditParams) and the final occurrences count.
	*/
	export async function ensureCorrectEdit(
	currentContent: string,
	originalParams: EditToolParams, // This is the EditToolParams from edit.ts, without \'corrected\'
	client: GeminiClient,
	abortSignal: AbortSignal,
	): Promise<CorrectedEditResult> {
	const cacheKey = `${currentContent}---${originalParams.old_string}---${originalParams.new_string}`;
	const cachedResult = editCorrectionCache.get(cacheKey);
	if (cachedResult) {
	return cachedResult;
	}

	let finalNewString = originalParams.new_string;
	const newStringPotentiallyEscaped =
	unescapeStringForGeminiBug(originalParams.new_string) !==
	originalParams.new_string;

	const expectedReplacements = originalParams.expected_replacements ?? 1;

	let finalOldString = originalParams.old_string;
	let occurrences = countOccurrences(currentContent, finalOldString);

	if (occurrences === expectedReplacements) {
	if (newStringPotentiallyEscaped) {
	finalNewString = await correctNewStringEscaping(
	client,
	finalOldString,
	originalParams.new_string,
	abortSignal,
	);
	}
	} else if (occurrences > expectedReplacements) {
	const expectedReplacements = originalParams.expected_replacements ?? 1;

	// If user expects multiple replacements, return as-is
	if (occurrences === expectedReplacements) {
	const result: CorrectedEditResult = {
	params: { ...originalParams },
	occurrences,
	};
	editCorrectionCache.set(cacheKey, result);
	return result;
	}

	// If user expects 1 but found multiple, try to correct (existing behavior)
	if (expectedReplacements === 1) {
	const result: CorrectedEditResult = {
	params: { ...originalParams },
	occurrences,
	};
	editCorrectionCache.set(cacheKey, result);
	return result;
	}

	// If occurrences don't match expected, return as-is (will fail validation later)
	const result: CorrectedEditResult = {
	params: { ...originalParams },
	occurrences,
	};
	editCorrectionCache.set(cacheKey, result);
	return result;
	} else {
	// occurrences is 0 or some other unexpected state initially
	const unescapedOldStringAttempt = unescapeStringForGeminiBug(
	originalParams.old_string,
	);
	occurrences = countOccurrences(currentContent, unescapedOldStringAttempt);

	if (occurrences === expectedReplacements) {
	finalOldString = unescapedOldStringAttempt;
	if (newStringPotentiallyEscaped) {
	finalNewString = await correctNewString(
	client,
	originalParams.old_string, // original old
	unescapedOldStringAttempt, // corrected old
	originalParams.new_string, // original new (which is potentially escaped)
	abortSignal,
	);
	}
	} else if (occurrences === 0) {
	const llmCorrectedOldString = await correctOldStringMismatch(
	client,
	currentContent,
	unescapedOldStringAttempt,
	abortSignal,
	);
	const llmOldOccurrences = countOccurrences(
	currentContent,
	llmCorrectedOldString,
	);

	if (llmOldOccurrences === expectedReplacements) {
	finalOldString = llmCorrectedOldString;
	occurrences = llmOldOccurrences;

	if (newStringPotentiallyEscaped) {
	const baseNewStringForLLMCorrection = unescapeStringForGeminiBug(
	originalParams.new_string,
	);
	finalNewString = await correctNewString(
	client,
	originalParams.old_string, // original old
	llmCorrectedOldString, // corrected old
	baseNewStringForLLMCorrection, // base new for correction
	abortSignal,
	);
	}
	} else {
	// LLM correction also failed for old_string
	const result: CorrectedEditResult = {
	params: { ...originalParams },
	occurrences: 0, // Explicitly 0 as LLM failed
	};
	editCorrectionCache.set(cacheKey, result);
	return result;
	}
	} else {
	// Unescaping old_string resulted in > 1 occurrences
	const result: CorrectedEditResult = {
	params: { ...originalParams },
	occurrences, // This will be > 1
	};
	editCorrectionCache.set(cacheKey, result);
	return result;
	}
	}

	const { targetString, pair } = trimPairIfPossible(
	finalOldString,
	finalNewString,
	currentContent,
	expectedReplacements,
	);
	finalOldString = targetString;
	finalNewString = pair;

	// Final result construction
	const result: CorrectedEditResult = {
	params: {
	file_path: originalParams.file_path,
	old_string: finalOldString,
	new_string: finalNewString,
	},
	occurrences: countOccurrences(currentContent, finalOldString), // Recalculate occurrences with the final old_string
	};
	editCorrectionCache.set(cacheKey, result);
	return result;
	}

	export async function ensureCorrectFileContent(
	content: string,
	client: GeminiClient,
	abortSignal: AbortSignal,
	): Promise<string> {
	const cachedResult = fileContentCorrectionCache.get(content);
	if (cachedResult) {
	return cachedResult;
	}

	const contentPotentiallyEscaped =
	unescapeStringForGeminiBug(content) !== content;
	if (!contentPotentiallyEscaped) {
	fileContentCorrectionCache.set(content, content);
	return content;
	}

	const correctedContent = await correctStringEscaping(
	content,
	client,
	abortSignal,
	);
	fileContentCorrectionCache.set(content, correctedContent);
	return correctedContent;
	}

	// Define the expected JSON schema for the LLM response for old_string correction
	const OLD_STRING_CORRECTION_SCHEMA: SchemaUnion = {
	type: Type.OBJECT,
	properties: {
	corrected_target_snippet: {
	type: Type.STRING,
	description:
	'The corrected version of the target snippet that exactly and uniquely matches a segment within the provided file content.',
	},
	},
	required: ['corrected_target_snippet'],
	};

	export async function correctOldStringMismatch(
	geminiClient: GeminiClient,
	fileContent: string,
	problematicSnippet: string,
	abortSignal: AbortSignal,
	): Promise<string> {
	const prompt = `
	Context: A process needs to find an exact literal, unique match for a specific text snippet within a file's content. The provided snippet failed to match exactly. This is most likely because it has been overly escaped.

	Task: Analyze the provided file content and the problematic target snippet. Identify the segment in the file content that the snippet was most likely intended to match. Output the exact, literal text of that segment from the file content. Focus only on removing extra escape characters and correcting formatting, whitespace, or minor differences to achieve a PERFECT literal match. The output must be the exact literal text as it appears in the file.

	Problematic target snippet:
	\`\`\`
	${problematicSnippet}
	\`\`\`

	File Content:
	\`\`\`
	${fileContent}
	\`\`\`

	For example, if the problematic target snippet was "\\\\\\nconst greeting = \`Hello \\\\\`\${name}\\\\\`\`;" and the file content had content that looked like "\nconst greeting = \`Hello ${'\\`'}\${name}${'\\`'}\`;", then corrected_target_snippet should likely be "\nconst greeting = \`Hello ${'\\`'}\${name}${'\\`'}\`;" to fix the incorrect escaping to match the original file content.
	If the differences are only in whitespace or formatting, apply similar whitespace/formatting changes to the corrected_target_snippet.

	Return ONLY the corrected target snippet in the specified JSON format with the key 'corrected_target_snippet'. If no clear, unique match can be found, return an empty string for 'corrected_target_snippet'.
	`.trim();

	const contents: Content[] = [{ role: 'user', parts: [{ text: prompt }] }];

	try {
	const result = await geminiClient.generateJson(
	contents,
	OLD_STRING_CORRECTION_SCHEMA,
	abortSignal,
	EditModel,
	EditConfig,
	);

	if (
	result &&
	typeof result.corrected_target_snippet === 'string' &&
	result.corrected_target_snippet.length > 0
	) {
	return result.corrected_target_snippet;
	} else {
	return problematicSnippet;
	}
	} catch (error) {
	if (abortSignal.aborted) {
	throw error;
	}

	console.error(
	'Error during LLM call for old string snippet correction:',
	error,
	);

	return problematicSnippet;
	}
	}

	// Define the expected JSON schema for the new_string correction LLM response
	const NEW_STRING_CORRECTION_SCHEMA: SchemaUnion = {
	type: Type.OBJECT,
	properties: {
	corrected_new_string: {
	type: Type.STRING,
	description:
	'The original_new_string adjusted to be a suitable replacement for the corrected_old_string, while maintaining the original intent of the change.',
	},
	},
	required: ['corrected_new_string'],
	};

	/**
	* Adjusts the new_string to align with a corrected old_string, maintaining the original intent.
	*/
	export async function correctNewString(
	geminiClient: GeminiClient,
	originalOldString: string,
	correctedOldString: string,
	originalNewString: string,
	abortSignal: AbortSignal,
	): Promise<string> {
	if (originalOldString === correctedOldString) {
	return originalNewString;
	}

	const prompt = `
	Context: A text replacement operation was planned. The original text to be replaced (original_old_string) was slightly different from the actual text in the file (corrected_old_string). The original_old_string has now been corrected to match the file content.
	We now need to adjust the replacement text (original_new_string) so that it makes sense as a replacement for the corrected_old_string, while preserving the original intent of the change.

	original_old_string (what was initially intended to be found):
	\`\`\`
	${originalOldString}
	\`\`\`

	corrected_old_string (what was actually found in the file and will be replaced):
	\`\`\`
	${correctedOldString}
	\`\`\`

	original_new_string (what was intended to replace original_old_string):
	\`\`\`
	${originalNewString}
	\`\`\`

	Task: Based on the differences between original_old_string and corrected_old_string, and the content of original_new_string, generate a corrected_new_string. This corrected_new_string should be what original_new_string would have been if it was designed to replace corrected_old_string directly, while maintaining the spirit of the original transformation.

	For example, if original_old_string was "\\\\\\nconst greeting = \`Hello \\\\\`\${name}\\\\\`\`;" and corrected_old_string is "\nconst greeting = \`Hello ${'\\`'}\${name}${'\\`'}\`;", and original_new_string was "\\\\\\nconst greeting = \`Hello \\\\\`\${name} \${lastName}\\\\\`\`;", then corrected_new_string should likely be "\nconst greeting = \`Hello ${'\\`'}\${name} \${lastName}${'\\`'}\`;" to fix the incorrect escaping.
	If the differences are only in whitespace or formatting, apply similar whitespace/formatting changes to the corrected_new_string.

	Return ONLY the corrected string in the specified JSON format with the key 'corrected_new_string'. If no adjustment is deemed necessary or possible, return the original_new_string.
	`.trim();

	const contents: Content[] = [{ role: 'user', parts: [{ text: prompt }] }];

	try {
	const result = await geminiClient.generateJson(
	contents,
	NEW_STRING_CORRECTION_SCHEMA,
	abortSignal,
	EditModel,
	EditConfig,
	);

	if (
	result &&
	typeof result.corrected_new_string === 'string' &&
	result.corrected_new_string.length > 0
	) {
	return result.corrected_new_string;
	} else {
	return originalNewString;
	}
	} catch (error) {
	if (abortSignal.aborted) {
	throw error;
	}

	console.error('Error during LLM call for new_string correction:', error);
	return originalNewString;
	}
	}

	const CORRECT_NEW_STRING_ESCAPING_SCHEMA: SchemaUnion = {
	type: Type.OBJECT,
	properties: {
	corrected_new_string_escaping: {
	type: Type.STRING,
	description:
	'The new_string with corrected escaping, ensuring it is a proper replacement for the old_string, especially considering potential over-escaping issues from previous LLM generations.',
	},
	},
	required: ['corrected_new_string_escaping'],
	};

	export async function correctNewStringEscaping(
	geminiClient: GeminiClient,
	oldString: string,
	potentiallyProblematicNewString: string,
	abortSignal: AbortSignal,
	): Promise<string> {
	const prompt = `
	Context: A text replacement operation is planned. The text to be replaced (old_string) has been correctly identified in the file. However, the replacement text (new_string) might have been improperly escaped by a previous LLM generation (e.g. too many backslashes for newlines like \\n instead of \n, or unnecessarily quotes like \\"Hello\\" instead of "Hello").

	old_string (this is the exact text that will be replaced):
	\`\`\`
	${oldString}
	\`\`\`

	potentially_problematic_new_string (this is the text that should replace old_string, but MIGHT have bad escaping, or might be entirely correct):
	\`\`\`
	${potentiallyProblematicNewString}
	\`\`\`

	Task: Analyze the potentially_problematic_new_string. If it's syntactically invalid due to incorrect escaping (e.g., "\n", "\t", "\\", "\\'", "\\""), correct the invalid syntax. The goal is to ensure the new_string, when inserted into the code, will be a valid and correctly interpreted.

	For example, if old_string is "foo" and potentially_problematic_new_string is "bar\\nbaz", the corrected_new_string_escaping should be "bar\nbaz".
	If potentially_problematic_new_string is console.log(\\"Hello World\\"), it should be console.log("Hello World").

	Return ONLY the corrected string in the specified JSON format with the key 'corrected_new_string_escaping'. If no escaping correction is needed, return the original potentially_problematic_new_string.
	`.trim();

	const contents: Content[] = [{ role: 'user', parts: [{ text: prompt }] }];

	try {
	const result = await geminiClient.generateJson(
	contents,
	CORRECT_NEW_STRING_ESCAPING_SCHEMA,
	abortSignal,
	EditModel,
	EditConfig,
	);

	if (
	result &&
	typeof result.corrected_new_string_escaping === 'string' &&
	result.corrected_new_string_escaping.length > 0
	) {
	return result.corrected_new_string_escaping;
	} else {
	return potentiallyProblematicNewString;
	}
	} catch (error) {
	if (abortSignal.aborted) {
	throw error;
	}

	console.error(
	'Error during LLM call for new_string escaping correction:',
	error,
	);
	return potentiallyProblematicNewString;
	}
	}

	const CORRECT_STRING_ESCAPING_SCHEMA: SchemaUnion = {
	type: Type.OBJECT,
	properties: {
	corrected_string_escaping: {
	type: Type.STRING,
	description:
	'The string with corrected escaping, ensuring it is valid, specially considering potential over-escaping issues from previous LLM generations.',
	},
	},
	required: ['corrected_string_escaping'],
	};

	export async function correctStringEscaping(
	potentiallyProblematicString: string,
	client: GeminiClient,
	abortSignal: AbortSignal,
	): Promise<string> {
	const prompt = `
	Context: An LLM has just generated potentially_problematic_string and the text might have been improperly escaped (e.g. too many backslashes for newlines like \\n instead of \n, or unnecessarily quotes like \\"Hello\\" instead of "Hello").

	potentially_problematic_string (this text MIGHT have bad escaping, or might be entirely correct):
	\`\`\`
	${potentiallyProblematicString}
	\`\`\`

	Task: Analyze the potentially_problematic_string. If it's syntactically invalid due to incorrect escaping (e.g., "\n", "\t", "\\", "\\'", "\\""), correct the invalid syntax. The goal is to ensure the text will be a valid and correctly interpreted.

	For example, if potentially_problematic_string is "bar\\nbaz", the corrected_new_string_escaping should be "bar\nbaz".
	If potentially_problematic_string is console.log(\\"Hello World\\"), it should be console.log("Hello World").

	Return ONLY the corrected string in the specified JSON format with the key 'corrected_string_escaping'. If no escaping correction is needed, return the original potentially_problematic_string.
	`.trim();

	const contents: Content[] = [{ role: 'user', parts: [{ text: prompt }] }];

	try {
	const result = await client.generateJson(
	contents,
	CORRECT_STRING_ESCAPING_SCHEMA,
	abortSignal,
	EditModel,
	EditConfig,
	);

	if (
	result &&
	typeof result.corrected_string_escaping === 'string' &&
	result.corrected_string_escaping.length > 0
	) {
	return result.corrected_string_escaping;
	} else {
	return potentiallyProblematicString;
	}
	} catch (error) {
	if (abortSignal.aborted) {
	throw error;
	}

	console.error(
	'Error during LLM call for string escaping correction:',
	error,
	);
	return potentiallyProblematicString;
	}
	}

	function trimPairIfPossible(
	target: string,
	trimIfTargetTrims: string,
	currentContent: string,
	expectedReplacements: number,
	) {
	const trimmedTargetString = target.trim();
	if (target.length !== trimmedTargetString.length) {
	const trimmedTargetOccurrences = countOccurrences(
	currentContent,
	trimmedTargetString,
	);

	if (trimmedTargetOccurrences === expectedReplacements) {
	const trimmedReactiveString = trimIfTargetTrims.trim();
	return {
	targetString: trimmedTargetString,
	pair: trimmedReactiveString,
	};
	}
	}

	return {
	targetString: target,
	pair: trimIfTargetTrims,
	};
	}

	/**
	* Unescapes a string that might have been overly escaped by an LLM.
	*/
	export function unescapeStringForGeminiBug(inputString: string): string {
	// Regex explanation:
	// \\ : Matches exactly one literal backslash character.
	// (n\|t\|r\|'\|"\|`\|\\\|\n) : This is a capturing group. It matches one of the following:
	// n, t, r, ', ", ` : These match the literal characters 'n', 't', 'r', single quote, double quote, or backtick.
	// This handles cases like "\\n", "\\`", etc.
	// \\ : This matches a literal backslash. This handles cases like "\\\\" (escaped backslash).
	// \n : This matches an actual newline character. This handles cases where the input
	// string might have something like "\\\n" (a literal backslash followed by a newline).
	// g : Global flag, to replace all occurrences.

	return inputString.replace(
	/\\+(n\|t\|r\|'\|"\|`\|\\\|\n)/g,
	(match, capturedChar) => {
	// 'match' is the entire erroneous sequence, e.g., if the input (in memory) was "\\\\`", match is "\\\\`".
	// 'capturedChar' is the character that determines the true meaning, e.g., '`'.

	switch (capturedChar) {
	case 'n':
	return '\n'; // Correctly escaped: \n (newline character)
	case 't':
	return '\t'; // Correctly escaped: \t (tab character)
	case 'r':
	return '\r'; // Correctly escaped: \r (carriage return character)
	case "'":
	return "'"; // Correctly escaped: ' (apostrophe character)
	case '"':
	return '"'; // Correctly escaped: " (quotation mark character)
	case '`':
	return '`'; // Correctly escaped: ` (backtick character)
	case '\\': // This handles when 'capturedChar' is a literal backslash
	return '\\'; // Replace escaped backslash (e.g., "\\\\") with single backslash
	case '\n': // This handles when 'capturedChar' is an actual newline
	return '\n'; // Replace the whole erroneous sequence (e.g., "\\\n" in memory) with a clean newline
	default:
	// This fallback should ideally not be reached if the regex captures correctly.
	// It would return the original matched sequence if an unexpected character was captured.
	return match;
	}
	},
	);
	}

	/**
	* Counts occurrences of a substring in a string
	*/
	export function countOccurrences(str: string, substr: string): number {
	if (substr === '') {
	return 0;
	}
	let count = 0;
	let pos = str.indexOf(substr);
	while (pos !== -1) {
	count++;
	pos = str.indexOf(substr, pos + substr.length); // Start search after the current match
	}
	return count;
	}

	export function resetEditCorrectorCaches_TEST_ONLY() {
	editCorrectionCache.clear();
	fileContentCorrectionCache.clear();
	}