Spaces:

shreyask
/

bol-tts-marathi

Running

App Files Files Community

bol-tts-marathi / src /tokenize.ts

shreyask

release: webgpu demo + source

2bb8806 verified 30 days ago

raw

history blame contribute delete

2.38 kB

	import type { KokoroVocab } from "./model";

	/**
	 * Output of converting text into model input. Word-level bookkeeping is kept
	 * alongside the flat token stream so that per-phoneme `pred_dur` values can be
	 * mapped back onto words when building timestamps.
	 */
	export interface TokenizeResult {
	  /** Token IDs for KModel. */
	  inputIds: number[];
	  /** Flat phoneme sequence: the character behind each entry of `inputIds`. */
	  phonemes: string[];
	  /** One entry per input word, in input order. */
	  words: {
	    /** Devanagari source word. */
	    word: string;
	    /** Raw espeak-ng output for this word. */
	    ipa: string;
	    /** Index into `inputIds` of the first kept phoneme of this word. */
	    tokenStart: number;
	    /** Exclusive end index into `inputIds`. */
	    tokenEnd: number;
	  }[];
	  /** Characters that were not in the vocab, with occurrence counts (diagnostics). */
	  droppedChars: Record<string, number>;
	}

	/**
	 * Convert per-word IPA (from phonemize.ts) into model-ready input_ids, keeping
	 * track of which input_ids came from which word for downstream timestamping.
	 *
	 * Characters not in `vocab` are silently dropped — this mirrors KModel.forward's
	 * own tokenization (`input_ids = filter(lambda i: i is not None, map(vocab.get, phonemes))`).
	 * Each dropped character is counted in `droppedChars`.
	 *
	 * A word-boundary space is emitted only *between* words that contribute at
	 * least one token, so the output never starts or ends with a separator and
	 * never contains two separators in a row because of an empty word.
	 *
	 * @param wordIpas - Words in order, each with its IPA string.
	 * @param vocab - Map from a single phoneme character to its token ID.
	 * @returns The flat token stream plus, for every input word (including words
	 *   that produced no tokens, for which `tokenStart === tokenEnd`), the
	 *   half-open range of `inputIds` it occupies.
	 */
	export function tokenizeByWord(
	  wordIpas: Array<{ word: string; ipa: string }>,
	  vocab: KokoroVocab,
	): TokenizeResult {
	  const inputIds: number[] = [];
	  const phonemes: string[] = [];
	  const words: TokenizeResult["words"] = [];
	  const droppedChars: Record<string, number> = {};

	  // May be undefined if the vocab has no space token; then no separators are emitted.
	  const spaceId = vocab[" "];

	  for (const { word, ipa } of wordIpas) {
	    // Resolve this word's tokens first, so we know whether it contributes
	    // anything before deciding to emit a separator for it.
	    const wordIds: number[] = [];
	    const wordChars: string[] = [];
	    for (const c of ipa) {
	      const id = vocab[c];
	      if (id === undefined) {
	        droppedChars[c] = (droppedChars[c] ?? 0) + 1;
	      } else {
	        wordIds.push(id);
	        wordChars.push(c);
	      }
	    }

	    // Insert the word-boundary space BEFORE the word rather than after it.
	    // (Trailing-space-after-last-word tends to get stripped by the predictor,
	    // which throws off pred_dur ↔ input_ids alignment and drops the last
	    // word from our timestamps.) Skipping the separator for token-less words
	    // and for the first emitted word keeps that guarantee even when a word's
	    // IPA is empty or entirely out of vocab.
	    if (wordIds.length > 0 && inputIds.length > 0 && spaceId !== undefined) {
	      inputIds.push(spaceId);
	      phonemes.push(" ");
	    }

	    const tokenStart = inputIds.length;
	    for (let i = 0; i < wordIds.length; i++) {
	      inputIds.push(wordIds[i]);
	      phonemes.push(wordChars[i]);
	    }
	    words.push({ word, ipa, tokenStart, tokenEnd: inputIds.length });
	  }

	  return { inputIds, phonemes, words, droppedChars };
	}