// Source: webapp/src/tokenizer.ts — uploaded by RASMUS via huggingface_hub (commit 5a33bec, verified)
import { Tokenizer } from '@huggingface/tokenizers'
// Literal marker substituted for every space before encoding (see encodeFinnishText),
// so word boundaries survive tokenization as an explicit token.
export const SPACE_TOKEN = '[SPACE]'
// Sentinel token ids used to frame a text token sequence (see wrapTextTokens).
export const START_TEXT_TOKEN = 255
export const STOP_TEXT_TOKEN = 0
// Alias for the tokenizer implementation shipped in the browser bundle.
export type BrowserTokenizer = Tokenizer
/**
 * Prepares raw user text for the TTS model: trims and collapses whitespace,
 * capitalizes the first letter, simplifies punctuation the voice model
 * handles poorly, and guarantees a sentence-ending character.
 *
 * @param text raw user input; empty or whitespace-only input yields a
 *             fixed placeholder prompt instead of normalized text
 * @returns normalized text ready for tokenization
 */
export function normalizeFinnishText(text: string): string {
  // Trim first so leading whitespace cannot defeat capitalization and so
  // whitespace-only input is treated the same as empty input (previously
  // "   " slipped past the empty check and normalized to just ".").
  let normalized = text.trim()
  if (normalized.length === 0) {
    return 'You need to add some text for me to talk.'
  }
  // Capitalize the first character only when it is a cased letter that is
  // currently lowercase; digits and punctuation are left untouched.
  const first = normalized.charAt(0)
  if (first.toLowerCase() === first && first !== first.toUpperCase()) {
    normalized = first.toUpperCase() + normalized.slice(1)
  }
  // Collapse any run of whitespace (tabs, newlines, multiple spaces) to one space.
  normalized = normalized.split(/\s+/).join(' ')
  // Replace punctuation the model does not handle well. Order matters:
  // '...' must run before single-character rules, and ' ,' cleans up any
  // space left sitting before a comma.
  const punctuationReplacements: Array<[string, string]> = [
    ['...', ', '],
    ['…', ', '],
    [':', ','],
    [' - ', ', '],
    [';', ', '],
    ['—', '-'],
    ['–', '-'],
    [' ,', ','],
    ['“', '"'],
    ['”', '"'],
    ['‘', "'"],
    ['’', "'"],
  ]
  for (const [from, to] of punctuationReplacements) {
    // split/join is the ES2020-compatible equivalent of String.replaceAll
    // for plain (non-regex) search strings.
    normalized = normalized.split(from).join(to)
  }
  // Rules like '...' -> ', ' can leave a trailing space behind.
  normalized = normalized.trimEnd()
  // Ensure the text ends with a character the model treats as a stop.
  const sentenceEnders = new Set(['.', '!', '?', '-', ','])
  const lastChar = normalized.charAt(normalized.length - 1)
  if (!sentenceEnders.has(lastChar)) {
    normalized += '.'
  }
  return normalized
}
export function createTokenizer(tokenizerJson: Record<string, unknown>, tokenizerConfig: Record<string, unknown> = {}): BrowserTokenizer {
return new Tokenizer(tokenizerJson, tokenizerConfig)
}
/**
 * Normalizes the given text, marks every word boundary with the explicit
 * SPACE_TOKEN, and encodes the result without the tokenizer's own special
 * tokens.
 *
 * @param tokenizer tokenizer created via createTokenizer
 * @param text raw user text
 * @returns encoded token ids
 */
export function encodeFinnishText(tokenizer: BrowserTokenizer, text: string): number[] {
  const normalized = normalizeFinnishText(text)
  const withSpaceMarkers = normalized.split(' ').join(SPACE_TOKEN)
  const encoding = tokenizer.encode(withSpaceMarkers, { add_special_tokens: false })
  return encoding.ids
}
/**
 * Frames a token id sequence with the model's start/stop sentinel ids.
 *
 * @param tokenIds encoded text token ids
 * @returns a new array: [START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN]
 */
export function wrapTextTokens(tokenIds: number[]): number[] {
  const framed: number[] = [START_TEXT_TOKEN]
  framed.push(...tokenIds)
  framed.push(STOP_TEXT_TOKEN)
  return framed
}