// Source: Hugging Face Space "Supertonic-TTS-WebGPU", file src/splitter.ts (11.8 kB).
// Uploaded by Xenova (HF Staff) in the commit "Upload 18 files" (2beb552, verified, 4 months ago).
// Note: the Space page reported "No application file" at the time this copy was taken.

	/**
	 * Reports whether a character ends a sentence.
	 *
	 * Recognised terminators are the ASCII characters ".", "!" and "?" plus the
	 * Unicode characters "…", "。", "？" and "！". A newline is also accepted by
	 * default, which is favourable for text-to-speech systems.
	 *
	 * @param c The character to test (expected to be a single character).
	 * @param includeNewlines Whether "\n" is treated as a terminator. Defaults to true.
	 * @returns True when `c` is a sentence terminator.
	 */
	function isSentenceTerminator(c: string, includeNewlines: boolean = true): boolean {
		// The empty string is a substring of every string, so `includes("")` would
		// report it as a terminator; reject it explicitly.
		if (c.length === 0) {
			return false;
		}
		return ".!?…。？！".includes(c) || (includeNewlines && c === "\n");
	}

	/**
	 * Tells whether a character belongs with the sentence terminator that precedes
	 * it, i.e. whether it is one of the closing quotes or brackets listed below.
	 * @param c The character to test.
	 * @returns True when `c` occurs in the trailing-character list.
	 */
	function isTrailingChar(c: string): boolean {
		const trailingChars = "\"')]}」』";
		return trailingChars.indexOf(c) !== -1;
	}

	/**
	 * Reads one token out of `buffer`: the run of non-whitespace characters that
	 * begins at `start` and stops at the first whitespace character or at the end
	 * of the buffer.
	 * @param buffer The input text.
	 * @param start The index at which the token begins.
	 * @returns The token, or an empty string when `start` is on whitespace or past the end.
	 */
	function getTokenFromBuffer(buffer: string, start: number): string {
		const whitespace = /\s/;
		let cursor = start;
		for (; cursor < buffer.length; ++cursor) {
			if (whitespace.test(buffer[cursor])) {
				break;
			}
		}
		return buffer.substring(start, cursor);
	}

	// Lower-case abbreviations (stored without their trailing period) after which a
	// period should not be read as the end of a sentence. Strings made of single
	// letters joined by periods (e.g., "i.e", "e.g", "u.s.a", "u.s") are not listed
	// here; they are handled separately.
	const ABBREVIATIONS: Set<string> = new Set([
		// Titles, ranks and offices.
		"mr", "mrs", "ms", "dr", "prof", "sr", "jr",
		"sgt", "col", "gen", "rep", "sen", "gov", "lt", "maj", "capt",
		// Places, organisations and miscellaneous.
		"st", "mt", "etc", "co", "inc", "ltd", "dept", "vs", "p", "pg",
		// Months.
		"jan", "feb", "mar", "apr", "jun", "jul",
		"aug", "sep", "sept", "oct", "nov", "dec",
		// Days of the week.
		"sun", "mon", "tu", "tue", "tues", "wed",
		"th", "thu", "thur", "thurs", "fri", "sat",
	]);

	/**
	 * Checks a token against the ABBREVIATIONS set.
	 * A possessive ending ('s or ’s) is stripped first, then any trailing periods,
	 * and the remainder is compared case-insensitively.
	 * @param token The token to check.
	 * @returns True when the normalised token is a listed abbreviation.
	 */
	function isAbbreviation(token: string): boolean {
		const withoutPossessive = token.replace(/['’]s$/i, "");
		const bare = withoutPossessive.replace(/\.+$/, "");
		return ABBREVIATIONS.has(bare.toLowerCase());
	}

	// Map of closing punctuation to their corresponding opening punctuation.
	// Each closing character appears exactly once: a Map keeps only the last value
	// for a repeated key, so a duplicated pair would be dead weight.
	const MATCHING: Map<string, string> = new Map([
		// ASCII brackets.
		[")", "("],
		["]", "["],
		["}", "{"],
		// Angle brackets and guillemets.
		["》", "《"],
		["〉", "〈"],
		["›", "‹"],
		["»", "«"],
		// CJK quotes and brackets.
		["」", "「"],
		["』", "『"],
		["〕", "〔"],
		["】", "【"],
	]);

	// Set of opening punctuation characters, derived from the values of MATCHING.
	const OPENING: Set<string> = new Set(MATCHING.values());

	/**
	 * Updates the nesting stack that tracks open quotes and paired punctuation.
	 *
	 * - Straight quotes (" and ') toggle: an unseen quote is pushed, a quote already
	 *   on the stack closes itself and everything pushed after it.
	 * - An apostrophe that follows a letter is ignored when it sits between two
	 *   letters (contraction, "don't") or when no single quote is open on top of the
	 *   stack (plural possessive, "wives'").
	 * - Opening punctuation listed in OPENING (e.g. "(", "[", "{", "「", "『") is pushed.
	 * - Closing punctuation listed in MATCHING pops the stack only when the matching
	 *   opener is on top; an unmatched closer is ignored.
	 *
	 * The stack is mutated in place.
	 *
	 * @param c The current character.
	 * @param stack The current nesting stack.
	 * @param i The index of the character in the buffer.
	 * @param buffer The full text being processed.
	 */
	function updateStack(c: string, stack: string[], i: number, buffer: string): void {
		// Handle standard quotes.
		if (c === '"' || c === "'") {
			// Any Unicode letter counts, so names such as "Zoë's" are not mistaken for
			// the start of a quotation (an ASCII-only test would leave the quote open
			// and suppress every later sentence split).
			const letter = /\p{L}/u;

			if (c === "'" && i > 0 && letter.test(buffer[i - 1])) {
				// Apostrophe between two letters: a contraction.
				if (i < buffer.length - 1 && letter.test(buffer[i + 1])) {
					return;
				}
				// Apostrophe at the end of a word (e.g., possessive "wives'"), unless a
				// single quote is currently open on top of the stack and this closes it.
				if (stack[stack.length - 1] !== "'") {
					return;
				}
			}

			// If the quote is already in the stack, it means we are closing it.
			// We search from the top of the stack down.
			const stackIndex = stack.lastIndexOf(c);
			if (stackIndex !== -1) {
				// If the match is not at the top (e.g. stack is ['"', "'"] and c is '"'),
				// the entries above it were likely apostrophes misidentified as opening
				// quotes. Unwind the stack to the matching quote, closing them all.
				stack.splice(stackIndex);
			} else {
				stack.push(c);
			}
			return;
		}

		// Handle opening punctuation.
		if (OPENING.has(c)) {
			stack.push(c);
			return;
		}

		// Handle closing punctuation: pop only when it matches the innermost opener.
		const expectedOpening = MATCHING.get(c);
		if (expectedOpening && stack.length && stack[stack.length - 1] === expectedOpening) {
			stack.pop();
		}
	}

	/**
	 * A simple stream-based text splitter that emits complete sentences.
	 *
	 * Text is appended with `push()`. Detected sentences are queued and can be read
	 * with `for await` (which waits for more text until `close()` is called) or with
	 * a synchronous `for...of` (which flushes the buffer and drains the queue).
	 */
	export class TextSplitterStream implements AsyncIterable<string>, Iterable<string> {
		/** Text received so far that has not yet been emitted as a sentence. */
		private _buffer: string;
		/** Queue of completed sentences waiting to be consumed. */
		private _sentences: string[];
		/** Wakes the async iterator while it is waiting for input; null otherwise. */
		private _resolver: (() => void) | null;
		/** Set once `close()` has been called. */
		private _closed: boolean;

		constructor() {
			this._buffer = "";
			this._sentences = [];
			this._resolver = null;
			this._closed = false;
		}

		/**
		 * Push one or more text chunks into the stream.
		 * The buffer is re-processed after each chunk.
		 * @param texts Text fragments to process.
		 */
		push(...texts: string[]): void {
			for (const txt of texts) {
				this._buffer += txt;
				this._process();
			}
		}

		/**
		 * Closes the stream, signaling that no more text will be pushed.
		 * This will flush any remaining text in the buffer as a sentence
		 * and allow the consuming process to finish processing the stream.
		 * @throws Error if the stream has already been closed.
		 */
		close(): void {
			if (this._closed) {
				throw new Error("Stream is already closed.");
			}
			this._closed = true;
			this.flush();
		}

		/**
		 * Flushes any remaining text in the buffer as a sentence.
		 * The remainder is trimmed and queued only if it is non-empty; the buffer is
		 * cleared and a waiting async iterator is woken either way.
		 */
		flush(): void {
			const remainder = this._buffer.trim();
			if (remainder.length > 0) {
				this._sentences.push(remainder);
			}
			this._buffer = "";
			this._resolve();
		}

		/**
		 * Resolve the pending promise to signal that sentences are available.
		 * Does nothing when no iterator is waiting.
		 */
		private _resolve(): void {
			if (this._resolver) {
				this._resolver();
				this._resolver = null;
			}
		}

		/**
		 * Processes the internal buffer to extract complete sentences.
		 * Each call scans the whole buffer from its start with a fresh nesting stack.
		 * If the potential sentence boundary is at the end of the current buffer,
		 * it waits for more text before splitting.
		 */
		private _process(): void {
			const buffer = this._buffer;
			const len = buffer.length;
			const stack: string[] = [];
			let sentenceStart = 0;
			let i = 0;

			// Helper to scan from the current index over trailing terminators and punctuation.
			const scanBoundary = (idx: number): { end: number; nextNonSpace: number } => {
				let end = idx;
				// Consume contiguous sentence terminators (excluding newlines).
				while (end + 1 < len && isSentenceTerminator(buffer[end + 1], false)) {
					++end;
				}
				// Consume trailing characters (e.g., closing quotes/brackets).
				while (end + 1 < len && isTrailingChar(buffer[end + 1])) {
					++end;
				}
				// Find the first non-whitespace character after the boundary.
				let nextNonSpace = end + 1;
				while (nextNonSpace < len && /\s/.test(buffer[nextNonSpace])) {
					++nextNonSpace;
				}
				return { end, nextNonSpace };
			};

			while (i < len) {
				const c = buffer[i];
				updateStack(c, stack, i, buffer);

				// Only consider splitting if we're not inside any nested structure.
				if (stack.length === 0 && isSentenceTerminator(c)) {
					const currentSegment = buffer.slice(sentenceStart, i);
					// Skip splitting for likely numbered lists (e.g., "1." or "\n2.").
					if (/(^|\n)\d+$/.test(currentSegment)) {
						++i;
						continue;
					}

					const { end: boundaryEnd, nextNonSpace } = scanBoundary(i);

					// If the terminator is not a newline and there's no extra whitespace,
					// we might be in the middle of a token (e.g., "$9.99"), so skip splitting.
					if (i === nextNonSpace - 1 && c !== "\n") {
						++i;
						continue;
					}

					// Wait for more text if there's no non-whitespace character yet.
					if (nextNonSpace === len) {
						break;
					}

					// Determine the token immediately preceding the terminator.
					let tokenStart = i - 1;
					while (tokenStart >= 0 && /\S/.test(buffer[tokenStart])) {
						tokenStart--;
					}
					tokenStart = Math.max(sentenceStart, tokenStart + 1);
					const token = getTokenFromBuffer(buffer, tokenStart);
					if (!token) {
						++i;
						continue;
					}

					// --- URL/email protection ---
					// If the token appears to be a URL or email (contains "://" or "@")
					// and does not already end with a terminator, skip past the token.
					// This only applies when the token extends beyond the terminator. For a
					// newline terminator the token ends exactly at `i`, so jumping to the
					// token end would leave `i` unchanged and loop forever; in that case the
					// newline is evaluated as a normal boundary instead.
					const tokenEnd = tokenStart + token.length;
					const lastChar = token[token.length - 1];
					if (
						tokenEnd > i &&
						(/https?[,:]\/\//.test(token) || token.includes("@")) &&
						!isSentenceTerminator(lastChar)
					) {
						i = tokenEnd;
						continue;
					}

					// --- Abbreviation protection ---
					if (isAbbreviation(token)) {
						++i;
						continue;
					}

					// --- Middle initials heuristic ---
					// If the token is a series of single-letter initials (each ending in a period)
					// and is followed by a capitalized word, assume it's part of a name.
					if (/^([A-Za-z]\.)+$/.test(token) && nextNonSpace < len && /[A-Z]/.test(buffer[nextNonSpace])) {
						++i;
						continue;
					}

					// --- Lookahead heuristic ---
					// If the terminator is a period and the next non-whitespace character is lowercase,
					// assume it is not the end of a sentence.
					if (c === "." && nextNonSpace < len && /[a-z]/.test(buffer[nextNonSpace])) {
						++i;
						continue;
					}

					// Special case: ellipsis that stands alone should be merged with the following sentence.
					const sentence = buffer.substring(sentenceStart, boundaryEnd + 1).trim();
					if (sentence === "..." || sentence === "…") {
						++i;
						continue;
					}

					// Accept the sentence boundary.
					if (sentence) {
						this._sentences.push(sentence);
					}
					// Move to the next sentence.
					i = sentenceStart = boundaryEnd + 1;
					continue;
				}
				++i;
			}

			// Remove the processed portion of the buffer.
			this._buffer = buffer.substring(sentenceStart);

			// Resolve any pending promise if sentences are available.
			if (this._sentences.length > 0) {
				this._resolve();
			}
		}

		/**
		 * Async iterator to yield sentences as they become available.
		 * Ends once the stream is closed and the queue is empty.
		 * @throws Error if another iterator is currently waiting for input.
		 */
		async *[Symbol.asyncIterator](): AsyncGenerator<string, void, void> {
			if (this._resolver) {
				throw new Error("Another iterator is already active.");
			}
			while (true) {
				if (this._sentences.length > 0) {
					// We use shift()! because we checked length > 0, so it cannot be undefined
					yield this._sentences.shift()!;
				} else if (this._closed) {
					// No more text will be pushed.
					break;
				} else {
					// Wait for more text; push()/flush() resolve this promise via _resolve().
					await new Promise<void>((resolve) => {
						this._resolver = resolve;
					});
				}
			}
		}

		/**
		 * Synchronous iterator that flushes the buffer and returns all sentences.
		 * The internal queue is replaced with an empty one, so the returned iterator
		 * owns the sentences that were queued at the time of the call.
		 */
		[Symbol.iterator](): Iterator<string> {
			this.flush();
			const iterator = this._sentences[Symbol.iterator]();
			this._sentences = [];
			return iterator;
		}

		/**
		 * Returns the array of sentences currently available.
		 * This is the internal queue itself, not a copy.
		 */
		get sentences(): string[] {
			return this._sentences;
		}
	}

	/**
	 * One-shot convenience wrapper: feeds `text` to a fresh TextSplitterStream and
	 * collects every sentence produced by its synchronous iterator.
	 * @param text The text to split.
	 * @returns An array of sentences.
	 */
	export function split(text: string): string[] {
		const stream = new TextSplitterStream();
		stream.push(text);
		return Array.from(stream);
	}