| import { readFile } from 'node:fs/promises' |
| import { resolve } from 'node:path' |
| import { describe, expect, it } from 'vitest' |
|
|
| import { createTokenizer, encodeFinnishText, normalizeFinnishText } from './tokenizer' |
|
|
// Finnish sample sentence fed through the tokenizer in the tests below.
// Deliberately includes diacritics (ä) and a hyphenated compound to exercise
// normalization and encoding beyond plain ASCII.
const SAMPLE_TEXT =
  'Tervetuloa kokeilemaan hienoviritettyä suomenkielistä Chatterbox-puhesynteesiä.'
|
|
// Golden token ids for SAMPLE_TEXT. Presumably captured from the reference
// (Python) tokenizer using the same tokenizer.json — TODO confirm; these must
// be regenerated if tokenizer.json or the normalization rules change.
const EXPECTED_IDS = [
  296, 44, 76, 33, 34, 99, 14, 2, 24, 28, 93, 22, 64, 26, 14, 43, 2, 21, 22, 50,
  28, 35, 98, 60, 18, 33, 210, 397, 2, 133, 65, 50, 24, 236, 25, 54, 33, 397, 2,
  279, 21, 48, 114, 165, 37, 8, 29, 34, 21, 61, 38, 27, 136, 61, 22, 397, 9,
]
|
|
describe('normalizeFinnishText', () => {
  it('mirrors the Python punctuation normalization rules', () => {
    // Table of [raw input, expected normalized output] pairs: the function is
    // expected to capitalize + add a terminal period, and to rewrite the
    // ellipsis character as a comma, matching the Python reference rules.
    const cases = [
      ['terve maailma', 'Terve maailma.'],
      ['Hei…', 'Hei,'],
    ]
    for (const [input, expected] of cases) {
      expect(normalizeFinnishText(input)).toBe(expected)
    }
  })
})
|
|
describe('encodeFinnishText', () => {
  it('reproduces the expected Finnish token ids from tokenizer.json', async () => {
    // tokenizer.json lives in the sibling Chatterbox-Finnish checkout; the
    // path is resolved relative to the process working directory.
    const tokenizerPath = resolve(
      process.cwd(),
      '..',
      'Chatterbox-Finnish',
      'pretrained_models',
      'tokenizer.json',
    )

    // Load and parse the tokenizer definition, then build the tokenizer from it.
    const rawJson = await readFile(tokenizerPath, 'utf8')
    const tokenizer = createTokenizer(JSON.parse(rawJson))

    // The encoded ids must match the golden ids captured from the reference run.
    const ids = encodeFinnishText(tokenizer, SAMPLE_TEXT)
    expect(ids).toEqual(EXPECTED_IDS)
  })
})
|
|