import { Tokenizer } from '@huggingface/tokenizers'
// Placeholder substituted for every space before encoding so word
// boundaries survive tokenization (see encodeFinnishText).
export const SPACE_TOKEN = '[SPACE]'
// Token id prepended to an encoded text sequence (see wrapTextTokens).
export const START_TEXT_TOKEN = 255
// Token id appended after an encoded text sequence (see wrapTextTokens).
export const STOP_TEXT_TOKEN = 0
// Alias for the Hugging Face browser tokenizer this module wraps.
export type BrowserTokenizer = Tokenizer
/**
 * Normalizes raw user text before tokenization: collapses whitespace,
 * capitalizes the first letter, simplifies punctuation, and guarantees the
 * result ends on a pause/stop character.
 *
 * @param text - Raw input; may be empty or whitespace-only.
 * @returns The normalized text, or a fallback prompt when no usable text
 *   was provided.
 */
export function normalizeFinnishText(text: string): string {
  // Collapse whitespace runs and trim up front so that (a) whitespace-only
  // input is caught by the empty check below instead of degenerating to '.',
  // and (b) capitalization sees the first real character, not a leading space.
  let normalized = text.split(/\s+/).join(' ').trim()
  if (normalized.length === 0) {
    return 'You need to add some text for me to talk.'
  }
  // Uppercase the first character only when it is a lowercase letter; the
  // toUpperCase comparison filters out digits/punctuation, which are caseless.
  const first = normalized.charAt(0)
  if (first.toLowerCase() === first && first !== first.toUpperCase()) {
    normalized = first.toUpperCase() + normalized.slice(1)
  }
  // Order matters: '...' must run before single-character rules, and the
  // spaced-dash rule before the em/en-dash conversions below it.
  const punctuationReplacements: Array<[string, string]> = [
    ['...', ', '],
    ['…', ', '],
    [':', ','],
    [' - ', ', '],
    [';', ', '],
    ['—', '-'],
    ['–', '-'],
    [' ,', ','],
    ['“', '"'],
    ['”', '"'],
    ['‘', "'"],
    ['’', "'"],
  ]
  for (const [from, to] of punctuationReplacements) {
    normalized = normalized.replaceAll(from, to)
  }
  // Replacements such as '...' -> ', ' can leave a trailing space.
  normalized = normalized.trimEnd()
  // Append a period unless the text already ends on a pause/stop character.
  const sentenceEnders = new Set(['.', '!', '?', '-', ','])
  if (!sentenceEnders.has(normalized.at(-1) ?? '')) {
    normalized += '.'
  }
  return normalized
}
export function createTokenizer(tokenizerJson: Record<string, unknown>, tokenizerConfig: Record<string, unknown> = {}): BrowserTokenizer {
return new Tokenizer(tokenizerJson, tokenizerConfig)
}
/**
 * Normalizes text and encodes it to token ids, marking each space with
 * SPACE_TOKEN so word boundaries are explicit in the token stream.
 *
 * @param tokenizer - Tokenizer created via createTokenizer.
 * @param text - Raw input text.
 * @returns Token ids, without special tokens added by the tokenizer.
 */
export function encodeFinnishText(tokenizer: BrowserTokenizer, text: string): number[] {
  const normalized = normalizeFinnishText(text)
  const withSpaceMarkers = normalized.split(' ').join(SPACE_TOKEN)
  const encoding = tokenizer.encode(withSpaceMarkers, { add_special_tokens: false })
  return encoding.ids
}
/**
 * Frames a token-id sequence with the start and stop text markers.
 *
 * @param tokenIds - Token ids produced by encodeFinnishText.
 * @returns A new array: [START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN].
 */
export function wrapTextTokens(tokenIds: number[]): number[] {
  const framed: number[] = [START_TEXT_TOKEN]
  framed.push(...tokenIds)
  framed.push(STOP_TEXT_TOKEN)
  return framed
}
|