File size: 1,749 Bytes
5a33bec
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import { Tokenizer } from '@huggingface/tokenizers'

// Placeholder inserted in place of literal spaces before encoding, so the
// tokenizer emits a dedicated token per word boundary (see encodeFinnishText).
export const SPACE_TOKEN = '[SPACE]'
// Marker token ids wrapped around an encoded sequence by wrapTextTokens.
// NOTE(review): presumably the model's start/stop-of-text ids — confirm
// against the model configuration.
export const START_TEXT_TOKEN = 255
export const STOP_TEXT_TOKEN = 0

// Alias so callers depend on this module's name rather than the library type.
export type BrowserTokenizer = Tokenizer

/**
 * Normalize free-form text into a form suitable for synthesis:
 * trimmed, first letter capitalized, whitespace collapsed, problematic
 * punctuation rewritten, and guaranteed to end with a pause character.
 *
 * @param text - Raw user-supplied text; may be empty.
 * @returns The normalized text, or a fallback prompt when the input is
 *   empty or whitespace-only.
 */
export function normalizeFinnishText(text: string): string {
  // Trim first so leading/trailing whitespace cannot defeat the empty-input
  // check or the first-letter capitalization below. (Previously a
  // whitespace-only string slipped past the length check and normalized to
  // just ".", and a leading space suppressed capitalization.)
  let normalized = text.trim()
  if (normalized.length === 0) {
    return 'You need to add some text for me to talk.'
  }

  // Capitalize the first character when it is a lowercase letter. The
  // second comparison distinguishes real letters from caseless characters
  // such as digits and punctuation (whose upper/lower forms are identical).
  const first = normalized[0]
  if (first.toLowerCase() === first && first !== first.toUpperCase()) {
    normalized = first.toUpperCase() + normalized.slice(1)
  }

  // Collapse every run of whitespace (tabs, newlines, multiple spaces)
  // into a single space.
  normalized = normalized.split(/\s+/).join(' ')

  // Rewrite punctuation into forms the downstream synthesis handles better,
  // and normalize typographic quotes/dashes to their ASCII equivalents.
  // Order matters: ' ,' cleanup runs after replacements that may introduce
  // a space before a comma.
  const punctuationReplacements: Array<[string, string]> = [
    ['...', ', '],
    ['…', ', '],
    [':', ','],
    [' - ', ', '],
    [';', ', '],
    ['—', '-'],
    ['–', '-'],
    [' ,', ','],
    ['“', '"'],
    ['”', '"'],
    ['‘', "'"],
    ['’', "'"],
  ]

  for (const [from, to] of punctuationReplacements) {
    normalized = normalized.replaceAll(from, to)
  }

  // Guarantee the text ends in a pause/stop character so synthesis does
  // not cut off abruptly mid-breath.
  normalized = normalized.trimEnd()
  const sentenceEnders = new Set(['.', '!', '?', '-', ','])
  if (!sentenceEnders.has(normalized.at(-1) ?? '')) {
    normalized += '.'
  }

  return normalized
}

export function createTokenizer(tokenizerJson: Record<string, unknown>, tokenizerConfig: Record<string, unknown> = {}): BrowserTokenizer {
  return new Tokenizer(tokenizerJson, tokenizerConfig)
}

/**
 * Normalize Finnish text and encode it into token ids.
 *
 * Literal spaces are rewritten to SPACE_TOKEN before encoding so word
 * boundaries survive tokenization; the tokenizer is asked not to add its
 * own special tokens.
 *
 * @param tokenizer - Tokenizer to encode with.
 * @param text - Raw input text.
 * @returns Token ids for the normalized text, without special tokens.
 */
export function encodeFinnishText(tokenizer: BrowserTokenizer, text: string): number[] {
  const normalized = normalizeFinnishText(text)
  const withSpaceMarkers = normalized.replaceAll(' ', SPACE_TOKEN)
  const encoding = tokenizer.encode(withSpaceMarkers, { add_special_tokens: false })
  return encoding.ids
}

/**
 * Surround a token id sequence with the start-of-text and stop-of-text
 * marker tokens.
 *
 * @param tokenIds - Encoded token ids (not mutated).
 * @returns A new array: [START_TEXT_TOKEN, ...tokenIds, STOP_TEXT_TOKEN].
 */
export function wrapTextTokens(tokenIds: number[]): number[] {
  const wrapped: number[] = [START_TEXT_TOKEN]
  for (const id of tokenIds) {
    wrapped.push(id)
  }
  wrapped.push(STOP_TEXT_TOKEN)
  return wrapped
}