Spaces:
No application file
No application file
| /** | |
| * Returns true if the character is considered a sentence terminator. | |
| * This includes ASCII (".", "!", "?") and common Unicode terminators. | |
| * NOTE: We also include newlines here, as this is favourable for text-to-speech systems. | |
| * @param c The character to test. | |
| * @param includeNewlines Whether to treat newlines as terminators. | |
| */ | |
| function isSentenceTerminator(c: string, includeNewlines: boolean = true): boolean { | |
| return ".!?…。?!".includes(c) || (includeNewlines && c === "\n"); | |
| } | |
| /** | |
| * Returns true if the character should be attached to the sentence terminator, | |
| * such as closing quotes or brackets. | |
| * @param c The character to test. | |
| */ | |
| function isTrailingChar(c: string): boolean { | |
| return "\"')]}」』".includes(c); | |
| } | |
| /** | |
| * Extracts a token (a contiguous sequence of non–whitespace characters) | |
| * from the buffer starting at the given index. | |
| * @param buffer The input text. | |
| * @param start The starting index. | |
| * @returns The extracted token. | |
| */ | |
| function getTokenFromBuffer(buffer: string, start: number): string { | |
| let end = start; | |
| while (end < buffer.length && !/\s/.test(buffer[end])) { | |
| ++end; | |
| } | |
| return buffer.substring(start, end); | |
| } | |
| // List of common abbreviations. Note that strings with single letters joined by periods | |
| // (e.g., "i.e", "e.g", "u.s.a", "u.s") are handled separately. | |
| const ABBREVIATIONS: Set<string> = new Set([ | |
| "mr", | |
| "mrs", | |
| "ms", | |
| "dr", | |
| "prof", | |
| "sr", | |
| "jr", | |
| "sgt", | |
| "col", | |
| "gen", | |
| "rep", | |
| "sen", | |
| "gov", | |
| "lt", | |
| "maj", | |
| "capt", | |
| "st", | |
| "mt", | |
| "etc", | |
| "co", | |
| "inc", | |
| "ltd", | |
| "dept", | |
| "vs", | |
| "p", | |
| "pg", | |
| "jan", | |
| "feb", | |
| "mar", | |
| "apr", | |
| "jun", | |
| "jul", | |
| "aug", | |
| "sep", | |
| "sept", | |
| "oct", | |
| "nov", | |
| "dec", | |
| "sun", | |
| "mon", | |
| "tu", | |
| "tue", | |
| "tues", | |
| "wed", | |
| "th", | |
| "thu", | |
| "thur", | |
| "thurs", | |
| "fri", | |
| "sat", | |
| ]); | |
| /** | |
| * Determines if the given token (or series of initials) is a known abbreviation. | |
| * @param token The token to check. | |
| */ | |
| function isAbbreviation(token: string): boolean { | |
| // Remove possessive endings and trailing periods. | |
| token = token.replace(/['’]s$/i, "").replace(/\.+$/, ""); | |
| return ABBREVIATIONS.has(token.toLowerCase()); | |
| } | |
| // Map of closing punctuation to their corresponding opening punctuation. | |
| const MATCHING: Map<string, string> = new Map([ | |
| [")", "("], | |
| ["]", "["], | |
| ["}", "{"], | |
| ["》", "《"], | |
| ["〉", "〈"], | |
| ["›", "‹"], | |
| ["»", "«"], | |
| ["〉", "〈"], | |
| ["」", "「"], | |
| ["』", "『"], | |
| ["〕", "〔"], | |
| ["】", "【"], | |
| ]); | |
| // Set of opening punctuation characters. | |
| const OPENING: Set<string> = new Set(MATCHING.values()); | |
| /** | |
| * Updates the nesting stack to track quotes and paired punctuation. | |
| * This supports both standard (", ', (), [], {}) and Japanese quotes (「」「』『』). | |
| * (An apostrophe between letters is ignored so that contractions remain intact.) | |
| * @param c The current character. | |
| * @param stack The current nesting stack. | |
| * @param i The index of the character in the buffer. | |
| * @param buffer The full text being processed. | |
| */ | |
| function updateStack(c: string, stack: string[], i: number, buffer: string): void { | |
| // Handle standard quotes. | |
| if (c === '"' || c === "'") { | |
| // Ignore an apostrophe if it's between letters (e.g., in contractions). | |
| if ( | |
| c === "'" && | |
| i > 0 && | |
| i < buffer.length - 1 && | |
| /[A-Za-z]/.test(buffer[i - 1]) && | |
| /[A-Za-z]/.test(buffer[i + 1]) | |
| ) { | |
| return; | |
| } | |
| // Ignore an apostrophe if it's at the end of a word (e.g., possessive "wives'"). | |
| if (c === "'" && i > 0 && /[A-Za-z]/.test(buffer[i - 1]) && (!stack.length || stack.at(-1) !== "'")) { | |
| return; | |
| } | |
| // If the quote is already in the stack, it means we are closing it. | |
| // We search from the top of the stack down. | |
| const stackIndex = stack.lastIndexOf(c); | |
| if (stackIndex !== -1) { | |
| // We found the matching opening quote. | |
| // If it's not at the top (e.g. stack is ['"', "'"] and c is '"'), | |
| // it means the intermediate quotes (like the single quote) were likely | |
| // apostrophes/contractions that were misidentified as opening quotes. | |
| // We "close" them all by unwinding the stack to this point. | |
| stack.splice(stackIndex); | |
| } else { | |
| stack.push(c); | |
| } | |
| return; | |
| } | |
| // Handle opening punctuation. | |
| if (OPENING.has(c)) { | |
| stack.push(c); | |
| return; | |
| } | |
| // Handle closing punctuation. | |
| const expectedOpening = MATCHING.get(c); | |
| if (expectedOpening && stack.length && stack.at(-1) === expectedOpening) { | |
| stack.pop(); | |
| } | |
| } | |
| /** | |
| * A simple stream-based text splitter that emits complete sentences. | |
| */ | |
| export class TextSplitterStream implements AsyncIterable<string>, Iterable<string> { | |
| private _buffer: string; | |
| private _sentences: string[]; | |
| private _resolver: (() => void) | null; | |
| private _closed: boolean; | |
| constructor() { | |
| this._buffer = ""; | |
| this._sentences = []; | |
| this._resolver = null; | |
| this._closed = false; | |
| } | |
| /** | |
| * Push one or more text chunks into the stream. | |
| * @param texts Text fragments to process. | |
| */ | |
| push(...texts: string[]): void { | |
| for (const txt of texts) { | |
| this._buffer += txt; | |
| this._process(); | |
| } | |
| } | |
| /** | |
| * Closes the stream, signaling that no more text will be pushed. | |
| * This will flush any remaining text in the buffer as a sentence | |
| * and allow the consuming process to finish processing the stream. | |
| */ | |
| close(): void { | |
| if (this._closed) { | |
| throw new Error("Stream is already closed."); | |
| } | |
| this._closed = true; | |
| this.flush(); | |
| } | |
| /** | |
| * Flushes any remaining text in the buffer as a sentence. | |
| */ | |
| flush(): void { | |
| const remainder = this._buffer.trim(); | |
| if (remainder.length > 0) { | |
| this._sentences.push(remainder); | |
| } | |
| this._buffer = ""; | |
| this._resolve(); | |
| } | |
| /** | |
| * Resolve the pending promise to signal that sentences are available. | |
| */ | |
| private _resolve(): void { | |
| if (this._resolver) { | |
| this._resolver(); | |
| this._resolver = null; | |
| } | |
| } | |
| /** | |
| * Processes the internal buffer to extract complete sentences. | |
| * If the potential sentence boundary is at the end of the current buffer, | |
| * it waits for more text before splitting. | |
| */ | |
| private _process(): void { | |
| let sentenceStart = 0; | |
| const buffer = this._buffer; | |
| const len = buffer.length; | |
| let i = 0; | |
| let stack: string[] = []; | |
| // Helper to scan from the current index over trailing terminators and punctuation. | |
| const scanBoundary = (idx: number): { end: number; nextNonSpace: number } => { | |
| let end = idx; | |
| // Consume contiguous sentence terminators (excluding newlines). | |
| while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) { | |
| ++end; | |
| } | |
| // Consume trailing characters (e.g., closing quotes/brackets). | |
| while (end + 1 < len && isTrailingChar(buffer[end + 1])) { | |
| ++end; | |
| } | |
| let nextNonSpace = end + 1; | |
| while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) { | |
| ++nextNonSpace; | |
| } | |
| return { end, nextNonSpace }; | |
| }; | |
| while (i < len) { | |
| const c = buffer[i]; | |
| updateStack(c, stack, i, buffer); | |
| // Only consider splitting if we're not inside any nested structure. | |
| if (stack.length === 0 && isSentenceTerminator(c)) { | |
| const currentSegment = buffer.slice(sentenceStart, i); | |
| // Skip splitting for likely numbered lists (e.g., "1." or "\n2."). | |
| if (/(^|\n)\d+$/.test(currentSegment)) { | |
| ++i; | |
| continue; | |
| } | |
| const { end: boundaryEnd, nextNonSpace } = scanBoundary(i); | |
| // If the terminator is not a newline and there's no extra whitespace, | |
| // we might be in the middle of a token (e.g., "$9.99"), so skip splitting. | |
| if (i === nextNonSpace - 1 && c !== "\n") { | |
| ++i; | |
| continue; | |
| } | |
| // Wait for more text if there's no non-whitespace character yet. | |
| if (nextNonSpace === len) { | |
| break; | |
| } | |
| // Determine the token immediately preceding the terminator. | |
| let tokenStart = i - 1; | |
| while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) { | |
| tokenStart--; | |
| } | |
| tokenStart = Math.max(sentenceStart, tokenStart + 1); | |
| const token = getTokenFromBuffer(buffer, tokenStart); | |
| if (!token) { | |
| ++i; | |
| continue; | |
| } | |
| // --- URL/email protection --- | |
| // If the token appears to be a URL or email (contains "://" or "@") | |
| // and does not already end with a terminator, skip splitting. | |
| if ( | |
| (/https?[,:]\/\//.test(token) || token.includes("@")) && | |
| token.at(-1) && | |
| !isSentenceTerminator(token.at(-1)!) | |
| ) { | |
| i = tokenStart + token.length; | |
| continue; | |
| } | |
| // --- Abbreviation protection --- | |
| if (isAbbreviation(token)) { | |
| ++i; | |
| continue; | |
| } | |
| // --- Middle initials heuristic --- | |
| // If the token is a series of single-letter initials (each ending in a period) | |
| // and is followed by a capitalized word, assume it's part of a name. | |
| if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) { | |
| ++i; | |
| continue; | |
| } | |
| // --- Lookahead heuristic --- | |
| // If the terminator is a period and the next non–whitespace character is lowercase, | |
| // assume it is not the end of a sentence. | |
| if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) { | |
| ++i; | |
| continue; | |
| } | |
| // Special case: ellipsis that stands alone should be merged with the following sentence. | |
| const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim(); | |
| if (sentence === "..." || sentence === "…") { | |
| ++i; | |
| continue; | |
| } | |
| // Accept the sentence boundary. | |
| if (sentence) { | |
| this._sentences.push(sentence); | |
| } | |
| // Move to the next sentence. | |
| i = sentenceStart = boundaryEnd + 1; | |
| continue; | |
| } | |
| ++i; | |
| } | |
| // Remove the processed portion of the buffer. | |
| this._buffer = buffer.substring(sentenceStart); | |
| // Resolve any pending promise if sentences are available. | |
| if (this._sentences.length > 0) { | |
| this._resolve(); | |
| } | |
| } | |
| /** | |
| * Async iterator to yield sentences as they become available. | |
| */ | |
| async *[Symbol.asyncIterator](): AsyncGenerator<string, void, void> { | |
| if (this._resolver) { | |
| throw new Error("Another iterator is already active."); | |
| } | |
| while (true) { | |
| if (this._sentences.length > 0) { | |
| // We use shift()! because we checked length > 0, so it cannot be undefined | |
| yield this._sentences.shift()!; | |
| } else if (this._closed) { | |
| // No more text will be pushed. | |
| break; | |
| } else { | |
| // Wait for more text. | |
| await new Promise<void>((resolve) => { | |
| this._resolver = resolve; | |
| }); | |
| } | |
| } | |
| } | |
| /** | |
| * Synchronous iterator that flushes the buffer and returns all sentences. | |
| */ | |
| [Symbol.iterator](): Iterator<string> { | |
| this.flush(); | |
| const iterator = this._sentences[Symbol.iterator](); | |
| this._sentences = []; | |
| return iterator; | |
| } | |
| /** | |
| * Returns the array of sentences currently available. | |
| */ | |
| get sentences(): string[] { | |
| return this._sentences; | |
| } | |
| } | |
| /** | |
| * Splits the input text into an array of sentences. | |
| * @param text The text to split. | |
| * @returns An array of sentences. | |
| */ | |
| export function split(text: string): string[] { | |
| const splitter = new TextSplitterStream(); | |
| splitter.push(text); | |
| return [...splitter]; | |
| } | |