Xenova's picture
Xenova HF Staff
Upload 18 files
2beb552 verified
raw
history blame
11.8 kB
/**
 * Returns true if the character is considered a sentence terminator.
 * This includes ASCII (".", "!", "?") and common Unicode terminators: the
 * horizontal ellipsis, the ideographic full stop, and the full-width "!" and "?".
 * NOTE: Newlines are also treated as terminators by default, as this is
 * favourable for text-to-speech systems.
 * @param c The character to test. Only a single UTF-16 code unit can match;
 * the empty string and longer strings are never terminators.
 * @param includeNewlines Whether to treat newlines as terminators.
 * @returns Whether `c` ends a sentence.
 */
function isSentenceTerminator(c: string, includeNewlines: boolean = true): boolean {
  // `String.prototype.includes("")` is always true, and a multi-character
  // argument such as ".!" would match as a substring. Neither is a terminator.
  if (c.length !== 1) {
    return false;
  }
  // The full-width forms (U+FF01, U+FF1F) are written as escapes so that text
  // normalisation of this file cannot fold them into their ASCII look-alikes.
  return ".!?…。\uFF01\uFF1F".includes(c) || (includeNewlines && c === "\n");
}
/**
 * Returns true if the character should be attached to the sentence terminator,
 * such as closing quotes or brackets.
 * @param c The character to test. Only a single UTF-16 code unit can match;
 * the empty string and longer strings are never trailing characters.
 * @returns Whether `c` is one of the recognised closing quote/bracket characters.
 */
function isTrailingChar(c: string): boolean {
  // The length guard rejects "" (which `includes` always reports as contained)
  // and multi-character substrings such as "')".
  return c.length === 1 && "\"')]}」』".includes(c);
}
/**
 * Reads one token out of the buffer: the longest run of non-whitespace
 * characters that begins exactly at `start`.
 * @param buffer The input text.
 * @param start The index at which the token begins.
 * @returns The token, or an empty string when `start` points at whitespace
 * or lies at/after the end of the buffer.
 */
function getTokenFromBuffer(buffer: string, start: number): string {
  // A sticky pattern only matches at `lastIndex`, so this grabs the run of
  // non-whitespace characters anchored at `start` without copying the tail.
  const nonWhitespaceRun = /\S*/y;
  nonWhitespaceRun.lastIndex = start;
  const match = nonWhitespaceRun.exec(buffer);
  return match?.[0] ?? "";
}
// Common abbreviations, stored lower-case and without their trailing period.
// Strings made of single letters joined by periods (e.g., "i.e", "e.g", "u.s.a", "u.s")
// are deliberately absent; they are handled separately.
const ABBREVIATIONS: Set<string> = new Set([
  // Personal titles and ranks.
  "mr", "mrs", "ms", "dr", "prof", "sr", "jr",
  "sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt",
  // Place names, organisations and general writing.
  "st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg",
  // Months.
  "jan", "feb", "mar", "apr", "jun", "jul", "aug", "sep", "sept", "oct", "nov", "dec",
  // Days of the week.
  "sun", "mon", "tu", "tue", "tues", "wed", "th", "thu", "thur", "thurs", "fri", "sat",
]);
/**
 * Checks whether a token is one of the entries in `ABBREVIATIONS`.
 * Before the lookup, a possessive ending ("'s" or "’s", any case) and any run of
 * trailing periods are stripped, and the remainder is lower-cased. Nothing else
 * is removed, so a token with leading punctuation or with inner periods
 * (e.g. "U.S.") does not match.
 * @param token The token to check.
 * @returns Whether the normalised token is a known abbreviation.
 */
function isAbbreviation(token: string): boolean {
  // Order matters: the possessive is only recognised at the very end of the
  // token, so it has to be removed before the trailing periods are.
  const withoutPossessive = token.replace(/['’]s$/i, "");
  const bare = withoutPossessive.replace(/\.+$/, "");
  const normalised = bare.toLowerCase();
  return ABBREVIATIONS.has(normalised);
}
// Map of closing punctuation to their corresponding opening punctuation.
const MATCHING: Map<string, string> = new Map([
  [")", "("],
  ["]", "["],
  ["}", "{"],
  ["》", "《"],
  ["〉", "〈"],
  ["›", "‹"],
  ["»", "«"],
  // Left/right-pointing angle brackets (U+2329 / U+232A). They are canonically
  // equivalent to the CJK angle brackets above, so Unicode normalisation turns a
  // literal pair into a duplicate of that entry; escapes keep them distinct.
  ["\u232A", "\u2329"],
  ["」", "「"],
  ["』", "『"],
  ["〕", "〔"],
  ["】", "【"],
]);
// Set of opening punctuation characters, derived from the map so the two stay in sync.
const OPENING: Set<string> = new Set(MATCHING.values());
/**
 * Keeps the nesting stack of currently open quotes and brackets up to date as
 * the scanner visits one character. The stack is modified in place.
 *
 * - Straight quotes (" and ') toggle: the first occurrence opens, a later one closes.
 * - Characters in `OPENING` are pushed; a closing character from `MATCHING` pops
 *   only when it pairs with the innermost open entry (this covers (), [], {} and
 *   the Japanese quotes 「」 and 『』, among others).
 * - An apostrophe that follows an ASCII letter is treated as part of a word
 *   (contraction or possessive) rather than as a quote, unless a single quote is
 *   the innermost open entry and no letter follows.
 *
 * @param c The current character.
 * @param stack The current nesting stack (mutated).
 * @param i The index of the character in the buffer.
 * @param buffer The full text being processed.
 */
function updateStack(c: string, stack: string[], i: number, buffer: string): void {
  const innermost = stack[stack.length - 1];

  // Paired punctuation.
  if (c !== '"' && c !== "'") {
    if (OPENING.has(c)) {
      stack.push(c);
      return;
    }
    // A closer that does not pair with the innermost opener is ignored.
    const opener = MATCHING.get(c);
    if (opener && innermost === opener) {
      stack.pop();
    }
    return;
  }

  // Apostrophes inside or at the end of a word are not quotes.
  if (c === "'") {
    const asciiLetter = /[A-Za-z]/;
    const followsLetter = i > 0 && asciiLetter.test(buffer[i - 1]);
    const precedesLetter = i < buffer.length - 1 && asciiLetter.test(buffer[i + 1]);
    // Between letters (e.g. "don't") it is always skipped. After a letter
    // (e.g. possessive "wives'") it is skipped unless it could be closing a
    // single quote that is currently the innermost open entry.
    if (followsLetter && (precedesLetter || innermost !== "'")) {
      return;
    }
  }

  // Straight quotes: close the most recent matching quote, or open a new one.
  const openedAt = stack.lastIndexOf(c);
  if (openedAt === -1) {
    stack.push(c);
    return;
  }
  // Anything opened after the matching quote (e.g. stack ['"', "'"] with c '"')
  // was most likely an apostrophe mistaken for an opening quote, so everything
  // from the match upwards is discarded together.
  stack.length = openedAt;
}
/**
 * A simple stream-based text splitter that emits complete sentences.
 *
 * Text is fed in with `push()`. Sentences that can already be delimited are
 * queued and can be read through the `sentences` getter, the async iterator
 * (which waits for more input until `close()` is called), or the synchronous
 * iterator (which flushes whatever is buffered and drains the queue).
 */
export class TextSplitterStream implements AsyncIterable<string>, Iterable<string> {
  /** Text received so far that has not yet been emitted as a sentence. */
  private _buffer: string;
  /** Completed sentences waiting to be consumed. */
  private _sentences: string[];
  /** Wakes the async iterator while it is waiting for input; `null` when nothing is waiting. */
  private _resolver: (() => void) | null;
  /** Set once `close()` has been called. */
  private _closed: boolean;

  constructor() {
    this._buffer = "";
    this._sentences = [];
    this._resolver = null;
    this._closed = false;
  }

  /**
   * Push one or more text chunks into the stream.
   * The buffer is re-scanned after each chunk is appended.
   * @param texts Text fragments to process.
   */
  push(...texts: string[]): void {
    for (const txt of texts) {
      this._buffer += txt;
      this._process();
    }
  }

  /**
   * Closes the stream, signaling that no more text will be pushed.
   * This will flush any remaining text in the buffer as a sentence
   * and allow the consuming process to finish processing the stream.
   * @throws Error if the stream has already been closed.
   */
  close(): void {
    if (this._closed) {
      throw new Error("Stream is already closed.");
    }
    this._closed = true;
    this.flush();
  }

  /**
   * Flushes any remaining text in the buffer as a sentence.
   * The remainder is trimmed and only queued when non-empty; the buffer is
   * emptied and a waiting async iterator is woken in either case.
   */
  flush(): void {
    const remainder = this._buffer.trim();
    if (remainder.length > 0) {
      this._sentences.push(remainder);
    }
    this._buffer = "";
    this._resolve();
  }

  /**
   * Resolve the pending promise to signal that sentences are available.
   * Does nothing when no consumer is waiting.
   */
  private _resolve(): void {
    if (this._resolver) {
      this._resolver();
      this._resolver = null;
    }
  }

  /**
   * Processes the internal buffer to extract complete sentences.
   * If the potential sentence boundary is at the end of the current buffer,
   * it waits for more text before splitting.
   *
   * The scan always restarts from the beginning of the unemitted buffer with an
   * empty nesting stack, so a boundary is only accepted outside quotes/brackets.
   */
  private _process(): void {
    let sentenceStart = 0;
    const buffer = this._buffer;
    const len = buffer.length;
    let i = 0;
    const stack: string[] = [];

    // Helper to scan from the current index over trailing terminators and punctuation.
    // `end` is the last index belonging to the sentence; `nextNonSpace` is the index of
    // the first non-whitespace character after it (or `len` if there is none yet).
    const scanBoundary = (idx: number): { end: number; nextNonSpace: number } => {
      let end = idx;
      // Consume contiguous sentence terminators (excluding newlines).
      while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
        ++end;
      }
      // Consume trailing characters (e.g., closing quotes/brackets).
      while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
        ++end;
      }
      let nextNonSpace = end + 1;
      while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
        ++nextNonSpace;
      }
      return { end, nextNonSpace };
    };

    while (i < len) {
      const c = buffer[i];
      updateStack(c, stack, i, buffer);

      // Only consider splitting if we're not inside any nested structure.
      if (stack.length === 0 && isSentenceTerminator(c)) {
        const currentSegment = buffer.slice(sentenceStart, i);

        // Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
        if (/(^|\n)\d+$/.test(currentSegment)) {
          ++i;
          continue;
        }

        const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);

        // If the terminator is not a newline and there's no extra whitespace,
        // we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
        if (i === nextNonSpace - 1 && c !== "\n") {
          ++i;
          continue;
        }

        // Wait for more text if there's no non-whitespace character yet.
        if (nextNonSpace === len) {
          break;
        }

        // Determine the token immediately preceding the terminator.
        let tokenStart = i - 1;
        while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
          tokenStart--;
        }
        tokenStart = Math.max(sentenceStart, tokenStart + 1);
        const token = getTokenFromBuffer(buffer, tokenStart);
        if (!token) {
          ++i;
          continue;
        }

        // --- URL/email protection ---
        // If the token appears to be a URL or email (contains "://" or "@")
        // and does not already end with a terminator, skip past the whole token.
        //
        // This only applies when the terminator lies inside the token
        // (`tokenEnd > i`). A newline terminator is whitespace, so the token ends
        // exactly at `i`; jumping to `tokenEnd` would then leave `i` unchanged and
        // the loop would never terminate (e.g. "see https://a.b\nNext line").
        // In that case the newline is treated as an ordinary boundary candidate.
        const tokenEnd = tokenStart + token.length;
        const lastChar = token[token.length - 1];
        if (
          tokenEnd > i &&
          (/https?[,:]\/\//.test(token) || token.includes("@")) &&
          !isSentenceTerminator(lastChar)
        ) {
          i = tokenEnd;
          continue;
        }

        // --- Abbreviation protection ---
        if (isAbbreviation(token)) {
          ++i;
          continue;
        }

        // --- Middle initials heuristic ---
        // If the token is a series of single-letter initials (each ending in a period)
        // and is followed by a capitalized word, assume it's part of a name.
        if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }

        // --- Lookahead heuristic ---
        // If the terminator is a period and the next non–whitespace character is lowercase,
        // assume it is not the end of a sentence.
        if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
          ++i;
          continue;
        }

        // Special case: ellipsis that stands alone should be merged with the following sentence.
        const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
        if (sentence === "..." || sentence === "…") {
          ++i;
          continue;
        }

        // Accept the sentence boundary.
        if (sentence) {
          this._sentences.push(sentence);
        }

        // Move to the next sentence.
        i = sentenceStart = boundaryEnd + 1;
        continue;
      }
      ++i;
    }

    // Remove the processed portion of the buffer.
    this._buffer = buffer.substring(sentenceStart);

    // Resolve any pending promise if sentences are available.
    if (this._sentences.length > 0) {
      this._resolve();
    }
  }

  /**
   * Async iterator to yield sentences as they become available.
   * It finishes once the stream is closed and the queue has been drained.
   * @throws Error if another iterator is currently waiting for input.
   */
  async *[Symbol.asyncIterator](): AsyncGenerator<string, void, void> {
    if (this._resolver) {
      throw new Error("Another iterator is already active.");
    }
    while (true) {
      if (this._sentences.length > 0) {
        // We use shift()! because we checked length > 0, so it cannot be undefined
        yield this._sentences.shift()!;
      } else if (this._closed) {
        // No more text will be pushed.
        break;
      } else {
        // Wait for more text.
        await new Promise<void>((resolve) => {
          this._resolver = resolve;
        });
      }
    }
  }

  /**
   * Synchronous iterator that flushes the buffer and returns all sentences.
   * The internal queue is replaced with a fresh empty array, so the returned
   * iterator owns the sentences it walks over.
   */
  [Symbol.iterator](): Iterator<string> {
    this.flush();
    const iterator = this._sentences[Symbol.iterator]();
    this._sentences = [];
    return iterator;
  }

  /**
   * Returns the array of sentences currently available.
   * This is the live internal queue, not a copy.
   */
  get sentences(): string[] {
    return this._sentences;
  }
}
/**
 * One-shot convenience wrapper: feeds the whole text through a fresh
 * `TextSplitterStream` and collects every sentence it produces, including the
 * final remainder that is flushed when the stream is iterated synchronously.
 * @param text The text to split.
 * @returns An array of sentences.
 */
export function split(text: string): string[] {
  const stream = new TextSplitterStream();
  stream.push(text);
  // Synchronous iteration flushes the buffer before yielding.
  return Array.from(stream);
}