| import { Tokenizer } from '@huggingface/tokenizers' |
|
|
// Placeholder inserted in place of literal spaces before encoding
// (see encodeFinnishText). NOTE(review): must exist in the tokenizer's
// vocabulary — confirm against the model's tokenizer.json.
export const SPACE_TOKEN = '[SPACE]'
// Sentinel token ids used to frame a text token sequence (see wrapTextTokens).
// NOTE(review): 255/0 look model-specific — confirm against the model config.
export const START_TEXT_TOKEN = 255
export const STOP_TEXT_TOKEN = 0


// Alias so consumers of this module don't depend on the
// @huggingface/tokenizers import directly.
export type BrowserTokenizer = Tokenizer
|
|
| export function normalizeFinnishText(text: string): string { |
| if (text.length === 0) { |
| return 'You need to add some text for me to talk.' |
| } |
|
|
| let normalized = text |
| if (normalized[0]?.toLowerCase() === normalized[0] && normalized[0] !== normalized[0]?.toUpperCase()) { |
| normalized = normalized[0].toUpperCase() + normalized.slice(1) |
| } |
|
|
| normalized = normalized.split(/\s+/).join(' ') |
|
|
| const punctuationReplacements: Array<[string, string]> = [ |
| ['...', ', '], |
| ['β¦', ', '], |
| [':', ','], |
| [' - ', ', '], |
| [';', ', '], |
| ['β', '-'], |
| ['β', '-'], |
| [' ,', ','], |
| ['β', '"'], |
| ['β', '"'], |
| ['β', "'"], |
| ['β', "'"], |
| ] |
|
|
| for (const [from, to] of punctuationReplacements) { |
| normalized = normalized.replaceAll(from, to) |
| } |
|
|
| normalized = normalized.trimEnd() |
| const sentenceEnders = new Set(['.', '!', '?', '-', ',']) |
| if (!sentenceEnders.has(normalized.at(-1) ?? '')) { |
| normalized += '.' |
| } |
|
|
| return normalized |
| } |
|
|
| export function createTokenizer(tokenizerJson: Record<string, unknown>, tokenizerConfig: Record<string, unknown> = {}): BrowserTokenizer { |
| return new Tokenizer(tokenizerJson, tokenizerConfig) |
| } |
|
|
| export function encodeFinnishText(tokenizer: BrowserTokenizer, text: string): number[] { |
| const prepared = normalizeFinnishText(text).replaceAll(' ', SPACE_TOKEN) |
| return tokenizer.encode(prepared, { add_special_tokens: false }).ids |
| } |
|
|
| export function wrapTextTokens(tokenIds: number[]): number[] { |
| return [START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN] |
| } |
|
|