// Source: webapp/src/tokenizer.ts — uploaded by RASMUS via huggingface_hub (commit 5a33bec, verified)
import { Tokenizer } from '@huggingface/tokenizers'
// Literal marker substituted for every space before encoding (see encodeFinnishText),
// so word boundaries survive tokenization as an explicit token.
export const SPACE_TOKEN = '[SPACE]'
// Sentinel token ids used to frame a text token sequence (see wrapTextTokens).
export const START_TEXT_TOKEN = 255
export const STOP_TEXT_TOKEN = 0
// Alias for the tokenizer implementation shipped in the browser bundle.
export type BrowserTokenizer = Tokenizer
/**
 * Prepares raw user text for the TTS model: trims and collapses whitespace,
 * capitalizes the first letter, simplifies punctuation the voice model
 * handles poorly, and guarantees a sentence-ending character.
 *
 * @param text raw user input; empty or whitespace-only input yields a
 *             fixed placeholder prompt instead of normalized text
 * @returns normalized text ready for tokenization
 */
export function normalizeFinnishText(text: string): string {
  // Trim first so leading whitespace cannot defeat capitalization and so
  // whitespace-only input is treated the same as empty input (previously
  // "   " slipped past the empty check and normalized to just ".").
  let normalized = text.trim()
  if (normalized.length === 0) {
    return 'You need to add some text for me to talk.'
  }
  // Capitalize the first character only when it is a cased letter that is
  // currently lowercase; digits and punctuation are left untouched.
  const first = normalized.charAt(0)
  if (first.toLowerCase() === first && first !== first.toUpperCase()) {
    normalized = first.toUpperCase() + normalized.slice(1)
  }
  // Collapse any run of whitespace (tabs, newlines, multiple spaces) to one space.
  normalized = normalized.split(/\s+/).join(' ')
  // Replace punctuation the model does not handle well. Order matters:
  // '...' must run before single-character rules, and ' ,' cleans up any
  // space left sitting before a comma.
  const punctuationReplacements: Array<[string, string]> = [
    ['...', ', '],
    ['…', ', '],
    [':', ','],
    [' - ', ', '],
    [';', ', '],
    ['—', '-'],
    ['–', '-'],
    [' ,', ','],
    ['“', '"'],
    ['”', '"'],
    ['‘', "'"],
    ['’', "'"],
  ]
  for (const [from, to] of punctuationReplacements) {
    // split/join is the ES2020-compatible equivalent of String.replaceAll
    // for plain (non-regex) search strings.
    normalized = normalized.split(from).join(to)
  }
  // Rules like '...' -> ', ' can leave a trailing space behind.
  normalized = normalized.trimEnd()
  // Ensure the text ends with a character the model treats as a stop.
  const sentenceEnders = new Set(['.', '!', '?', '-', ','])
  const lastChar = normalized.charAt(normalized.length - 1)
  if (!sentenceEnders.has(lastChar)) {
    normalized += '.'
  }
  return normalized
}
export function createTokenizer(tokenizerJson: Record<string, unknown>, tokenizerConfig: Record<string, unknown> = {}): BrowserTokenizer {
return new Tokenizer(tokenizerJson, tokenizerConfig)
}
/**
 * Normalizes the given text, marks every word boundary with the explicit
 * SPACE_TOKEN, and encodes the result without the tokenizer's own special
 * tokens.
 *
 * @param tokenizer tokenizer created via createTokenizer
 * @param text raw user text
 * @returns encoded token ids
 */
export function encodeFinnishText(tokenizer: BrowserTokenizer, text: string): number[] {
  const normalized = normalizeFinnishText(text)
  const withSpaceMarkers = normalized.split(' ').join(SPACE_TOKEN)
  const encoding = tokenizer.encode(withSpaceMarkers, { add_special_tokens: false })
  return encoding.ids
}
/**
 * Frames a token id sequence with the model's start/stop sentinel ids.
 *
 * @param tokenIds encoded text token ids
 * @returns a new array: [START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN]
 */
export function wrapTextTokens(tokenIds: number[]): number[] {
  const framed: number[] = [START_TEXT_TOKEN]
  framed.push(...tokenIds)
  framed.push(STOP_TEXT_TOKEN)
  return framed
}