import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'

import { describe, expect, it } from 'vitest'

import { createTokenizer, encodeFinnishText, normalizeFinnishText } from './tokenizer'

// Finnish sentence used as the encoding fixture.
const SAMPLE_TEXT =
  'Tervetuloa kokeilemaan hienoviritettyä suomenkielistä Chatterbox-puhesynteesiä.'

// Token ids that encodeFinnishText must produce for SAMPLE_TEXT when the
// tokenizer is built from the tokenizer.json loaded in the test below.
// NOTE(review): presumably captured from the reference Python tokenizer — confirm.
const EXPECTED_IDS = [
  296, 44, 76, 33, 34, 99, 14, 2, 24, 28,
  93, 22, 64, 26, 14, 43, 2, 21, 22, 50,
  28, 35, 98, 60, 18, 33, 210, 397, 2, 133,
  65, 50, 24, 236, 25, 54, 33, 397, 2, 279,
  21, 48, 114, 165, 37, 8, 29, 34, 21, 61,
  38, 27, 136, 61, 22, 397, 9,
]

describe('normalizeFinnishText', () => {
  it('mirrors the Python punctuation normalization rules', () => {
    // Each entry is [raw input, expected normalized output].
    const normalizationCases = [
      // No terminal punctuation: output is capitalized and ends with a period.
      ['terve maailma', 'Terve maailma.'],
      // A trailing ellipsis character comes back as a comma.
      ['Hei…', 'Hei,'],
    ]

    for (const [rawText, normalized] of normalizationCases) {
      expect(normalizeFinnishText(rawText)).toBe(normalized)
    }
  })
})

describe('encodeFinnishText', () => {
  it('reproduces the expected Finnish token ids from tokenizer.json', async () => {
    // The vocabulary file is located relative to the current working
    // directory: it must sit in a `Chatterbox-Finnish` directory next to
    // the directory the test run is started from.
    const vocabPathSegments = ['..', 'Chatterbox-Finnish', 'pretrained_models', 'tokenizer.json']
    const vocabFile = resolve(process.cwd(), ...vocabPathSegments)

    const rawVocab = await readFile(vocabFile, 'utf8')
    const finnishTokenizer = createTokenizer(JSON.parse(rawVocab))

    const actualIds = encodeFinnishText(finnishTokenizer, SAMPLE_TEXT)
    expect(actualIds).toEqual(EXPECTED_IDS)
  })
})