Spaces:
Running
Running
| /** | |
| * | |
| * Copyright 2023-2025 InspectorRAGet Team | |
| * | |
| * Licensed under the Apache License, Version 2.0 (the "License"); | |
| * you may not use this file except in compliance with the License. | |
| * You may obtain a copy of the License at | |
| * | |
| * http://www.apache.org/licenses/LICENSE-2.0 | |
| * | |
| * Unless required by applicable law or agreed to in writing, software | |
| * distributed under the License is distributed on an "AS IS" BASIS, | |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| * See the License for the specific language governing permissions and | |
| * limitations under the License. | |
| * | |
| **/ | |
| import { StringMatchObject } from '@/src/types'; | |
| const crypto = require('crypto'); | |
| export function truncate(text: string, length: number): string { | |
| if (text.length > length) { | |
| return text.slice(0, length) + ' ...'; | |
| } | |
| return text; | |
| } | |
| export function hash(text: string): string { | |
| return crypto.createHash('md5').update(text).digest('hex'); | |
| } | |
| /** | |
| * Normalize text (e.g., characters used for quotes). Used to improve matching. | |
| * @param text | |
| * @returns | |
| */ | |
| function normalize(text) { | |
| var normalizedText = text; | |
| // normalize double and single quotes | |
| normalizedText = text.replace(/[ββ]/g, '"'); | |
| normalizedText = normalizedText.replace(/[ββ]/g, "'"); | |
| return normalizedText; | |
| } | |
| /** | |
| * Helper functions to identify token boundaries | |
| * getNextTokenStart: Identifies start of next token | |
| * getNextTokenEnd: Identified end of next token | |
| */ | |
| /** | |
| * Identify start of next token | |
| * @param text | |
| * @param offset starting offset in the text | |
| * @returns starting position index of next token | |
| */ | |
| function getNextTokenStart(text: string, offset: number = 0): number { | |
| // Step 1: Set starting index to provided offset | |
| var startIndex = offset; | |
| // Step 2: Skip over non-alphanumeric characters at the start | |
| while (startIndex < text.length && /\W/.test(text.charAt(startIndex))) { | |
| startIndex++; | |
| } | |
| // Step 3: Return | |
| return startIndex; | |
| } | |
| /** | |
| * Identify end of next token | |
| * @param text | |
| * @param offset starting offset in the text | |
| * @returns ending position index of next token | |
| */ | |
| function getNextTokenEnd(text: string, offset: number = 0): number { | |
| // Step 1: Set end index to be starting index of next token | |
| var endIndex = getNextTokenStart(text, offset); | |
| // Step 2: Include alphanumeric characters until the first non-alphanumeric character is found | |
| while (endIndex < text.length && !/\W/.test(text.charAt(endIndex))) { | |
| endIndex++; | |
| } | |
| return endIndex; | |
| } | |
| /** | |
| * Create regular expression based on string | |
| * @param text regular expression string | |
| * @returns | |
| */ | |
| function createRegex(text: string): RegExp { | |
| // Escape regular expression characters | |
| const escapedText = text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); | |
| return new RegExp(escapedText, 'g'); | |
| } | |
| /** | |
| * Find matches in the text based on query using regular expression | |
| * @param query string to find | |
| * @param text | |
| * @returns | |
| */ | |
| function match(query: string, text: string) { | |
| // Step 1: Create regular expression | |
| const regex = createRegex(query); | |
| // Step 2: Find matches | |
| const matches: { readonly start: number; readonly end: number }[] = []; | |
| let match; | |
| while ((match = regex.exec(text)) !== null) { | |
| matches.push({ | |
| start: match.index, | |
| end: regex.lastIndex, | |
| }); | |
| } | |
| // Step 3: Return | |
| return matches; | |
| } | |
| /** | |
| * Find overlap based on matching tokens between source and target | |
| * @param source string from which tokens are used to find overlap | |
| * @param target string in which overlaps are found | |
| * @param min_match_tokens | |
| * @returns | |
| */ | |
| export function overlaps( | |
| source: string, | |
| target: string, | |
| min_match_tokens: number = 3, | |
| ) { | |
| // Step 1: Normalize source and target text | |
| const normalizedSource = normalize(source).toLowerCase(); | |
| const normalizedTarget = normalize(target).toLowerCase(); | |
| // Step 2: Define necessary variables | |
| const matches: StringMatchObject[] = []; | |
| // Step 3: Find matches | |
| // Step 3.a: Identify starting position for next token in the source and set current end position to same starting position | |
| let curStartPos = getNextTokenStart(normalizedSource, 0); | |
| // Step 3.b: Keep finding next starting position till all tokens in the source are seen | |
| while (curStartPos < normalizedSource.length) { | |
| let curEndPos = curStartPos; | |
| let matchTokenLength = 0; | |
| let substringEndPos = curStartPos; | |
| // Step 3.b.i: Identify next minimum match tokens | |
| while (matchTokenLength < min_match_tokens - 1) { | |
| substringEndPos = getNextTokenEnd(normalizedSource, substringEndPos); | |
| matchTokenLength++; | |
| } | |
| // Step 3.b.ii: | |
| do { | |
| // Update temporary end position | |
| substringEndPos = getNextTokenEnd(normalizedSource, substringEndPos); | |
| // Find matches for the source substring in the target | |
| var matchesInTarget = normalizedTarget.match( | |
| createRegex(normalizedSource.substring(curStartPos, substringEndPos)), | |
| ); | |
| if (matchesInTarget != null) { | |
| curEndPos = substringEndPos; | |
| } | |
| } while (matchesInTarget != null && curEndPos < normalizedSource.length); | |
| if (curEndPos !== curStartPos) { | |
| const localMatches: { start: number; end: number }[] = match( | |
| normalizedSource.substring(curStartPos, curEndPos), | |
| normalizedTarget, | |
| ); | |
| matches.push({ | |
| start: curStartPos, | |
| end: curEndPos, | |
| text: normalizedSource.substring(curStartPos, curEndPos), | |
| matchesInTarget: localMatches, | |
| count: localMatches.length, | |
| }); | |
| // Set current starting position to next token in the source past current ending position | |
| curStartPos = getNextTokenStart(normalizedSource, curEndPos); | |
| } else { | |
| // Set current starting position to next token in the source past current ending position | |
| curStartPos = getNextTokenStart( | |
| normalizedSource, | |
| getNextTokenEnd(normalizedSource, curEndPos), | |
| ); | |
| } | |
| } | |
| return matches; | |
| } | |