import { readFile } from 'node:fs/promises'
import { resolve } from 'node:path'
import { describe, expect, it } from 'vitest'

import { createTokenizer, encodeFinnishText, normalizeFinnishText } from './tokenizer'

// Reference sentence exercised against the Finnish tokenizer below.
const SAMPLE_TEXT =
  'Tervetuloa kokeilemaan hienoviritettyä suomenkielistä Chatterbox-puhesynteesiä.'

// Golden token ids that encoding SAMPLE_TEXT is expected to produce.
const EXPECTED_IDS = [
  296, 44, 76, 33, 34, 99, 14, 2, 24, 28, 93, 22, 64, 26, 14, 43, 2, 21, 22,
  50, 28, 35, 98, 60, 18, 33, 210, 397, 2, 133, 65, 50, 24, 236, 25, 54, 33,
  397, 2, 279, 21, 48, 114, 165, 37, 8, 29, 34, 21, 61, 38, 27, 136, 61, 22,
  397, 9,
]

describe('normalizeFinnishText', () => {
  it('mirrors the Python punctuation normalization rules', () => {
    // Each pair is [raw input, expected normalized output].
    const cases = [
      ['terve   maailma', 'Terve maailma.'],
      ['Hei…', 'Hei,'],
    ]
    for (const [input, expected] of cases) {
      expect(normalizeFinnishText(input)).toBe(expected)
    }
  })
})

describe('encodeFinnishText', () => {
  it('reproduces the expected Finnish token ids from tokenizer.json', async () => {
    // NOTE(review): the tokenizer config is located relative to process.cwd(),
    // so this test assumes vitest is launched from a directory that has a
    // sibling Chatterbox-Finnish checkout — confirm against the CI setup.
    const tokenizerPath = resolve(
      process.cwd(),
      '..',
      'Chatterbox-Finnish',
      'pretrained_models',
      'tokenizer.json',
    )
    const rawConfig = await readFile(tokenizerPath, 'utf8')
    const tokenizer = createTokenizer(JSON.parse(rawConfig))

    expect(encodeFinnishText(tokenizer, SAMPLE_TEXT)).toEqual(EXPECTED_IDS)
  })
})