| import { readFile } from 'node:fs/promises' |
| import { resolve } from 'node:path' |
| import { describe, expect, it } from 'vitest' |
|
|
| import { createTokenizer, encodeFinnishText, normalizeFinnishText } from './tokenizer' |
|
|
// Finnish sample sentence fed through the tokenizer in the tests below.
// Deliberately includes diacritics (ä) and a hyphenated compound to exercise
// normalization and encoding beyond plain ASCII.
const SAMPLE_TEXT =
  'Tervetuloa kokeilemaan hienoviritettyä suomenkielistä Chatterbox-puhesynteesiä.'
|
|
// Golden token ids for SAMPLE_TEXT. Presumably captured from the reference
// (Python) tokenizer using the same tokenizer.json — TODO confirm; these must
// be regenerated if tokenizer.json or the normalization rules change.
const EXPECTED_IDS = [
  296, 44, 76, 33, 34, 99, 14, 2, 24, 28, 93, 22, 64, 26, 14, 43, 2, 21, 22, 50,
  28, 35, 98, 60, 18, 33, 210, 397, 2, 133, 65, 50, 24, 236, 25, 54, 33, 397, 2,
  279, 21, 48, 114, 165, 37, 8, 29, 34, 21, 61, 38, 27, 136, 61, 22, 397, 9,
]
|
|
describe('normalizeFinnishText', () => {
  it('mirrors the Python punctuation normalization rules', () => {
    // Table of [raw input, expected normalized output] pairs: the function is
    // expected to capitalize + add a terminal period, and to rewrite the
    // ellipsis character as a comma, matching the Python reference rules.
    const cases = [
      ['terve maailma', 'Terve maailma.'],
      ['Hei…', 'Hei,'],
    ]
    for (const [input, expected] of cases) {
      expect(normalizeFinnishText(input)).toBe(expected)
    }
  })
})
|
|
describe('encodeFinnishText', () => {
  it('reproduces the expected Finnish token ids from tokenizer.json', async () => {
    // tokenizer.json lives in the sibling Chatterbox-Finnish checkout; the
    // path is resolved relative to the process working directory.
    const tokenizerPath = resolve(
      process.cwd(),
      '..',
      'Chatterbox-Finnish',
      'pretrained_models',
      'tokenizer.json',
    )

    // Load and parse the tokenizer definition, then build the tokenizer from it.
    const rawJson = await readFile(tokenizerPath, 'utf8')
    const tokenizer = createTokenizer(JSON.parse(rawJson))

    // The encoded ids must match the golden ids captured from the reference run.
    const ids = encodeFinnishText(tokenizer, SAMPLE_TEXT)
    expect(ids).toEqual(EXPECTED_IDS)
  })
})
|
|