// Source: Chatterbox-Finnish-ONNX/webapp/src/tokenizer.test.ts
// Uploaded by RASMUS with huggingface_hub ("Upload webapp/src/tokenizer.test.ts"), commit e6863e4 (verified).
import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'
import { describe, expect, it } from 'vitest'
import { createTokenizer, encodeFinnishText, normalizeFinnishText } from './tokenizer'
/** Finnish sentence used as the encoder input in the token-id regression test below. */
const SAMPLE_TEXT = 'Tervetuloa kokeilemaan hienoviritettyä suomenkielistä Chatterbox-puhesynteesiä.'

/**
 * Token ids that `encodeFinnishText` is expected to return for `SAMPLE_TEXT`.
 * NOTE(review): presumably captured from the reference (Python) tokenizer — confirm
 * the provenance before regenerating these values.
 */
const EXPECTED_IDS = [
  296, 44, 76, 33, 34, 99, 14, 2, 24, 28,
  93, 22, 64, 26, 14, 43, 2, 21, 22, 50,
  28, 35, 98, 60, 18, 33, 210, 397, 2, 133,
  65, 50, 24, 236, 25, 54, 33, 397, 2, 279,
  21, 48, 114, 165, 37, 8, 29, 34, 21, 61,
  38, 27, 136, 61, 22, 397, 9,
]
// Checks normalizeFinnishText against a small table of input/expected-output pairs.
describe('normalizeFinnishText', () => {
  it('mirrors the Python punctuation normalization rules', () => {
    // Each entry is [raw input, expected normalized output].
    const cases: ReadonlyArray<readonly [string, string]> = [
      // Expected: first letter upper-cased and a terminating period appended.
      ['terve maailma', 'Terve maailma.'],
      // Expected: the ellipsis character is turned into a comma.
      ['Hei…', 'Hei,'],
    ]

    for (const [raw, normalized] of cases) {
      expect(normalizeFinnishText(raw)).toBe(normalized)
    }
  })
})
// Regression test: encodes SAMPLE_TEXT with a tokenizer built from the on-disk
// tokenizer.json and compares the resulting ids with EXPECTED_IDS.
describe('encodeFinnishText', () => {
  it('reproduces the expected Finnish token ids from tokenizer.json', async () => {
    // NOTE(review): the fixture is located relative to process.cwd(), so the test
    // depends on the directory the runner is started from — confirm this matches
    // how the suite is invoked.
    const fixtureSegments = ['..', 'Chatterbox-Finnish', 'pretrained_models', 'tokenizer.json']
    const fixturePath = resolve(process.cwd(), ...fixtureSegments)

    const rawJson = await readFile(fixturePath, 'utf8')
    const tokenizer = createTokenizer(JSON.parse(rawJson))

    const actualIds = encodeFinnishText(tokenizer, SAMPLE_TEXT)
    expect(actualIds).toEqual(EXPECTED_IDS)
  })
})