import { Tokenizer } from '@huggingface/tokenizers'

/** Literal string substituted for every space before text is handed to the tokenizer. */
export const SPACE_TOKEN = '[SPACE]'
/** Token id that `wrapTextTokens` places in front of the encoded text. */
export const START_TEXT_TOKEN = 255
/** Token id that `wrapTextTokens` places after the encoded text. */
export const STOP_TEXT_TOKEN = 0

/** Alias for the `Tokenizer` class exported by `@huggingface/tokenizers`. */
export type BrowserTokenizer = Tokenizer

/** Sentence returned by `normalizeFinnishText` when there is nothing to speak. */
const EMPTY_TEXT_FALLBACK = 'You need to add some text for me to talk.'

/**
 * Literal substitutions applied in order by `normalizeFinnishText`.
 * Order matters: e.g. `' - '` becomes `', '` before `' ,'` is tightened to `','`.
 */
const PUNCTUATION_REPLACEMENTS: ReadonlyArray<readonly [string, string]> = [
  ['...', ', '],
  ['…', ', '],
  [':', ','],
  [' - ', ', '],
  [';', ', '],
  ['—', '-'],
  ['–', '-'],
  [' ,', ','],
  ['“', '"'],
  ['”', '"'],
  ['‘', "'"],
  ['’', "'"],
]

/** Final characters after which `normalizeFinnishText` does not append a period. */
const SENTENCE_ENDERS: ReadonlySet<string> = new Set(['.', '!', '?', '-', ','])

/**
 * Cleans up free-form input text before tokenization.
 *
 * Steps, in order:
 * 1. Strip leading whitespace and collapse every whitespace run to one space.
 * 2. Return a fixed fallback sentence if nothing is left.
 * 3. Upper-case the first character when it is a lower-case letter.
 * 4. Apply the punctuation substitutions in `PUNCTUATION_REPLACEMENTS`.
 * 5. Trim trailing whitespace and append `.` unless the text already ends
 *    with one of `. ! ? - ,`.
 *
 * @param text Raw user input.
 * @returns The normalized text; never an empty string.
 */
export function normalizeFinnishText(text: string): string {
  // Leading whitespace is removed *before* splitting: `' a'.split(/\s+/)`
  // yields a leading empty element, which would otherwise survive as a
  // leading space, block the capitalization below and later be encoded as a
  // stray SPACE_TOKEN. Trailing whitespace is deliberately left for the
  // final trimEnd() so a trailing ' - ' is still rewritten to ', '.
  let normalized = text.trimStart().split(/\s+/).join(' ')

  // Covers both the empty string and whitespace-only input.
  if (normalized.length === 0) {
    return EMPTY_TEXT_FALLBACK
  }

  // "Is a lower-case letter": unchanged by toLowerCase() but changed by
  // toUpperCase(). Digits and punctuation fail the second test.
  const first = normalized.charAt(0)
  if (first === first.toLowerCase() && first !== first.toUpperCase()) {
    normalized = first.toUpperCase() + normalized.slice(1)
  }

  for (const [from, to] of PUNCTUATION_REPLACEMENTS) {
    normalized = normalized.replaceAll(from, to)
  }

  normalized = normalized.trimEnd()

  if (!SENTENCE_ENDERS.has(normalized.at(-1) ?? '')) {
    normalized += '.'
  }

  return normalized
}

/**
 * Constructs a tokenizer from already-parsed tokenizer JSON.
 *
 * @param tokenizerJson Parsed tokenizer definition passed as the first constructor argument.
 * @param tokenizerConfig Parsed tokenizer configuration; defaults to an empty object.
 * @returns A new `Tokenizer` instance.
 */
export function createTokenizer(
  tokenizerJson: Record<string, unknown>,
  tokenizerConfig: Record<string, unknown> = {},
): BrowserTokenizer {
  return new Tokenizer(tokenizerJson, tokenizerConfig)
}

/**
 * Normalizes `text`, replaces each space with `SPACE_TOKEN`, and encodes the
 * result with `tokenizer`.
 *
 * @param tokenizer Tokenizer used for encoding.
 * @param text Raw user input.
 * @returns The token ids reported by the tokenizer (encoded with
 *   `add_special_tokens: false`); start/stop ids are added separately by
 *   `wrapTextTokens`.
 */
export function encodeFinnishText(tokenizer: BrowserTokenizer, text: string): number[] {
  const prepared = normalizeFinnishText(text).replaceAll(' ', SPACE_TOKEN)
  return tokenizer.encode(prepared, { add_special_tokens: false }).ids
}

/**
 * Surrounds a token sequence with the start and stop text token ids.
 *
 * @param tokenIds Encoded text tokens; not mutated.
 * @returns A new array `[START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN]`.
 */
export function wrapTextTokens(tokenIds: readonly number[]): number[] {
  return [START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN]
}