Dionyssos committed on
Commit
e8c7b60
·
1 Parent(s): 4813448
Files changed (5) hide show
  1. README.md +3 -6
  2. app.py +12 -131
  3. audionar.py +0 -623
  4. requirements.txt +0 -3
  5. textual.py +0 -515
README.md CHANGED
@@ -1,8 +1,8 @@
1
  ---
2
  title: Audiogen
3
  emoji: 🍍
4
- colorFrom: green
5
- colorTo: blue
6
  sdk: gradio
7
  sdk_version: 5.41.1
8
  app_file: app.py
@@ -10,9 +10,6 @@ short_description: AudioGen for CPU
10
  license: cc-by-nc-4.0
11
  tags:
12
  - audiogen
13
- - soundscapes
14
- - shift
15
- - tts
16
  ---
17
 
18
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Audiogen
3
  emoji: 🍍
4
+ colorFrom: gray
5
+ colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.41.1
8
  app_file: app.py
 
10
  license: cc-by-nc-4.0
11
  tags:
12
  - audiogen
13
+ - audiocraft
 
 
14
  ---
15
 
 
app.py CHANGED
@@ -1,142 +1,33 @@
1
  # -*- coding: utf-8 -*-
2
- import typing
3
  import gradio as gr
4
  import numpy as np
5
- import os
6
  import torch
7
- import torch.nn as nn
8
  import soundfile
9
- from textual import only_greek_or_only_latin, transliterate_number, fix_vocals
10
- import textwrap
11
-
12
- from audionar import VitsModel, VitsTokenizer
13
  from audiocraft import AudioGen
14
 
15
-
16
-
17
  audiogen = AudioGen().eval().to('cpu')
18
 
19
 
20
-
21
-
22
-
23
-
24
-
25
- language_names = ['Ancient greek',
26
- 'English',
27
- 'Deutsch',
28
- 'French',
29
- 'Hungarian',
30
- 'Romanian',
31
- 'Serbian (Approx.)']
32
-
33
-
34
- def audionar_tts(text=None,
35
- lang='Romanian',
36
- soundscape='frogs',
37
  max_tokens=24,
38
  cache_lim=-1):
39
 
40
- # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
41
-
42
-
43
- lang_map = {
44
- 'ancient greek': 'grc',
45
- 'english': 'eng',
46
- 'deutsch': 'deu',
47
- 'french': 'fra',
48
- 'hungarian': 'hun',
49
- 'romanian': 'ron',
50
- 'serbian (approx.)': 'rmc-script_latin',
51
- }
52
-
53
-
54
- final_audio = None
55
-
56
-
57
- if text is None or text.strip() == '':
58
-
59
- x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
60
-
61
- else: # VITS
62
-
63
- lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
64
-
65
- global cached_lang_code, cached_net_g, cached_tokenizer
66
-
67
- if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
68
- cached_lang_code = lang_code
69
- cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
70
- cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
71
-
72
- net_g = cached_net_g
73
- tokenizer = cached_tokenizer
74
- text = only_greek_or_only_latin(text, lang=lang_code)
75
- text = transliterate_number(text, lang=lang_code)
76
- text = fix_vocals(text, lang=lang_code) + '!' # assures the text has at least 1 character that has token emb
77
-
78
-
79
- sentences = textwrap.wrap(text, width=439)
80
-
81
- total_audio_parts = []
82
- for sentence in sentences:
83
- inputs = cached_tokenizer(sentence, return_tensors="pt")
84
- with torch.no_grad():
85
- audio_part = cached_net_g(
86
- input_ids=inputs.input_ids,
87
- attention_mask=inputs.attention_mask,
88
- lang_code=lang_code,
89
- )[0, :]
90
- total_audio_parts.append(audio_part)
91
-
92
- x = torch.cat(total_audio_parts).cpu().numpy()
93
-
94
-
95
  if soundscape and soundscape.strip():
96
 
97
 
98
- speech_duration_secs = len(x) / 16000
99
- target_duration = max(speech_duration_secs + 0.74, 2.0)
100
  # Sink Attn
101
  background_audio = audiogen.generate(
102
  soundscape[:64], # to have shape of cross attention not grow large of T5 Num tokens
103
- duration=target_duration,
104
- max_tokens=min( max(7, int(max_tokens)), 288 ), # limit sounds tokens (clone beyond)
105
- cache_lim=min( max(6, int(cache_lim)), 2000),
106
  ).numpy()
107
 
108
- # PAD
109
-
110
- len_speech = len(x)
111
- len_background = len(background_audio)
112
-
113
- if len_background > len_speech:
114
- padding = np.zeros(len_background - len_speech,
115
- dtype=np.float32)
116
- x = np.concatenate([x, padding])
117
- elif len_speech > len_background:
118
- padding = np.zeros(len_speech - len_background,
119
- dtype=np.float32)
120
- background_audio = np.concatenate([background_audio, padding])
121
-
122
-
123
- x = x[:, None]
124
- background_audio = background_audio[:, None]
125
-
126
-
127
- final_audio = np.concatenate([
128
- 0.49 * x + 0.51 * background_audio,
129
- 0.51 * background_audio + 0.49 * x
130
- ], 1)
131
-
132
- else:
133
-
134
- final_audio = x
135
-
136
 
137
  wavfile = '_vits_.wav'
138
 
139
- soundfile.write(wavfile, final_audio, 16000) # soundfile needs [time, channels]
140
  return wavfile
141
 
142
  # TTS
@@ -146,23 +37,13 @@ def audionar_tts(text=None,
146
  with gr.Blocks() as demo:
147
  with gr.Row():
148
  text_input = gr.Textbox(
149
- label="Type text for TTS:",
150
- placeholder="Type Text for TTS",
151
- lines=4,
152
- value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
153
- )
154
- lang_dropdown = gr.Dropdown(
155
- choices=language_names,
156
- label="Lang",
157
- value=language_names[0],
158
- )
159
- soundscape_input = gr.Textbox(
160
- lines=1,
161
- value="dogs barging",
162
- label="AudioGen Txt"
163
  )
164
  cache_lim = gr.Number(
165
- label="Flush kv",
166
  value=71,
167
  )
168
  n_tokens = gr.Number(
@@ -176,7 +57,7 @@ with gr.Blocks() as demo:
176
 
177
  generate_button.click(
178
  fn=audionar_tts,
179
- inputs=[text_input, lang_dropdown, soundscape_input, n_tokens, cache_lim],
180
  outputs=[output_audio]
181
  )
182
  demo.launch(debug=True)
 
1
  # -*- coding: utf-8 -*-
 
2
  import gradio as gr
3
  import numpy as np
 
4
  import torch
 
5
  import soundfile
 
 
 
 
6
  from audiocraft import AudioGen
7
 
 
 
8
  audiogen = AudioGen().eval().to('cpu')
9
 
10
 
11
+ def audionar_tts(text='frogs',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  max_tokens=24,
13
  cache_lim=-1):
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  if soundscape and soundscape.strip():
16
 
17
 
18
+ dur_seconds = max(max_tokens * 320 / 16000 + 0.74, 2.0)
 
19
  # Sink Attn
20
  background_audio = audiogen.generate(
21
  soundscape[:64], # to have shape of cross attention not grow large of T5 Num tokens
22
+ duration=dur_seconds,
23
+ max_tokens=max(7, int(max_tokens)), # kv cache lowest n_preserve
24
+ cache_lim=max(6, int(cache_lim)),
25
  ).numpy()
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  wavfile = '_vits_.wav'
29
 
30
+ soundfile.write(wavfile, background_audio, 16000) # soundfile needs [time, channels]
31
  return wavfile
32
 
33
  # TTS
 
37
  with gr.Blocks() as demo:
38
  with gr.Row():
39
  text_input = gr.Textbox(
40
+ label="AudioGen Txt:",
41
+ placeholder="Describe sound - Type Any language",
42
+ lines=2,
43
+ value='dogs barg',
 
 
 
 
 
 
 
 
 
 
44
  )
45
  cache_lim = gr.Number(
46
+ label="kv Cache Flush:",
47
  value=71,
48
  )
49
  n_tokens = gr.Number(
 
57
 
58
  generate_button.click(
59
  fn=audionar_tts,
60
+ inputs=[text, n_tokens, cache_lim],
61
  outputs=[output_audio]
62
  )
63
  demo.launch(debug=True)
audionar.py DELETED
@@ -1,623 +0,0 @@
1
- import math
2
- import numpy as np
3
- import torch
4
- from torch import nn
5
- from transformers.modeling_utils import PreTrainedModel
6
- from transformers.configuration_utils import PretrainedConfig
7
- import json
8
- import os
9
- import re
10
- from transformers.tokenization_utils import PreTrainedTokenizer
11
- import phonemizer
12
- import torch.nn.functional as F
13
-
14
-
15
-
16
- OSCILLATION = {
17
- 'deu': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
18
- 'rmc-script_latin': [2, 2, 1, 2, 2],
19
- 'hun': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
20
- 'fra': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
21
- 'eng': [1, 2, 2, 1, 2, 2],
22
- 'grc': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
23
- 'ron': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2],
24
- }
25
-
26
-
27
- def has_non_roman_characters(input_string):
28
- # Find any character outside the ASCII range
29
- non_roman_pattern = re.compile(r"[^\x00-\x7F]")
30
-
31
- # Search the input string for non-Roman characters
32
- match = non_roman_pattern.search(input_string)
33
- has_non_roman = match is not None
34
- return has_non_roman
35
-
36
-
37
- class VitsConfig(PretrainedConfig):
38
-
39
- model_type = "vits"
40
-
41
- def __init__(
42
- self,
43
- vocab_size=38,
44
- hidden_size=192,
45
- num_hidden_layers=6,
46
- num_attention_heads=2,
47
- use_bias=True,
48
- ffn_dim=768,
49
- ffn_kernel_size=3,
50
- flow_size=192,
51
- # hidden_act="relu",
52
- upsample_initial_channel=512,
53
- upsample_rates=[8, 8, 2, 2],
54
- upsample_kernel_sizes=[16, 16, 4, 4],
55
- resblock_kernel_sizes=[3, 7, 11],
56
- resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
57
- prior_encoder_num_flows=4,
58
- prior_encoder_num_wavenet_layers=4,
59
- wavenet_kernel_size=5,
60
- **kwargs,
61
- ):
62
- self.vocab_size = vocab_size
63
- self.hidden_size = hidden_size
64
- self.num_hidden_layers = num_hidden_layers
65
- self.num_attention_heads = num_attention_heads
66
- self.use_bias = use_bias
67
- self.ffn_dim = ffn_dim
68
- self.ffn_kernel_size = ffn_kernel_size
69
- self.flow_size = flow_size
70
- self.upsample_initial_channel = upsample_initial_channel
71
- self.upsample_rates = upsample_rates
72
- self.upsample_kernel_sizes = upsample_kernel_sizes
73
- self.resblock_kernel_sizes = resblock_kernel_sizes
74
- self.resblock_dilation_sizes = resblock_dilation_sizes
75
- self.prior_encoder_num_flows = prior_encoder_num_flows
76
- self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
77
- self.wavenet_kernel_size = wavenet_kernel_size
78
- super().__init__()
79
-
80
-
81
- class VitsWaveNet(torch.nn.Module):
82
- def __init__(self, config, num_layers):
83
- super().__init__()
84
- self.hidden_size = config.hidden_size
85
- self.num_layers = num_layers
86
- self.in_layers = torch.nn.ModuleList()
87
- self.res_skip_layers = torch.nn.ModuleList()
88
- # if hasattr(nn.utils.parametrizations, "weight_norm"):
89
- # # raise ValueError
90
- weight_norm = nn.utils.parametrizations.weight_norm
91
- # else:
92
- # raise ValueError
93
- # # weight_norm = nn.utils.weight_norm
94
- for i in range(num_layers):
95
-
96
- in_layer = torch.nn.Conv1d(
97
- in_channels=config.hidden_size,
98
- out_channels=2 * config.hidden_size,
99
- kernel_size=config.wavenet_kernel_size,
100
- dilation=1,
101
- padding=2,
102
- )
103
- in_layer = weight_norm(in_layer, name="weight")
104
- self.in_layers.append(in_layer)
105
-
106
- # last one is not necessary
107
- if i < num_layers - 1:
108
- res_skip_channels = 2 * config.hidden_size
109
- else:
110
- res_skip_channels = config.hidden_size
111
- res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
112
- res_skip_layer = weight_norm(res_skip_layer, name="weight")
113
- self.res_skip_layers.append(res_skip_layer)
114
-
115
- def forward(self,
116
- inputs):
117
- outputs = torch.zeros_like(inputs)
118
- num_channels = torch.IntTensor([self.hidden_size])[0]
119
- for i in range(self.num_layers):
120
- in_act = self.in_layers[i](inputs)
121
- # global_states = torch.zeros_like(hidden_states) # style ?
122
- # acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
123
- # --
124
- # def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
125
- # in_act = input_a # + input_b
126
- t_act = torch.tanh(in_act[:, :num_channels, :])
127
- s_act = torch.sigmoid(in_act[:, num_channels:, :])
128
- acts = t_act * s_act
129
- res_skip_acts = self.res_skip_layers[i](acts)
130
- if i < self.num_layers - 1:
131
- res_acts = res_skip_acts[:, : self.hidden_size, :]
132
- inputs = inputs + res_acts
133
- outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
134
- else:
135
- outputs = outputs + res_skip_acts
136
- return outputs
137
-
138
- # Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
139
- class HifiGanResidualBlock(nn.Module):
140
- def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
141
- super().__init__()
142
- self.leaky_relu_slope = leaky_relu_slope
143
-
144
- self.convs1 = nn.ModuleList(
145
- [
146
- nn.Conv1d(
147
- channels,
148
- channels,
149
- kernel_size,
150
- stride=1,
151
- dilation=dilation[i],
152
- padding=self.get_padding(kernel_size, dilation[i]),
153
- )
154
- for i in range(len(dilation))
155
- ]
156
- )
157
- self.convs2 = nn.ModuleList(
158
- [
159
- nn.Conv1d(
160
- channels,
161
- channels,
162
- kernel_size,
163
- stride=1,
164
- dilation=1,
165
- padding=self.get_padding(kernel_size, 1),
166
- )
167
- for _ in range(len(dilation))
168
- ]
169
- )
170
-
171
- def get_padding(self, kernel_size, dilation=1):
172
- # 1, 3, 5, 15
173
- return (kernel_size * dilation - dilation) // 2
174
-
175
- def forward(self, hidden_states):
176
- for conv1, conv2 in zip(self.convs1, self.convs2):
177
- residual = hidden_states
178
- hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
179
- hidden_states = conv1(hidden_states)
180
- hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
181
- hidden_states = conv2(hidden_states)
182
- hidden_states = hidden_states + residual
183
- return hidden_states
184
-
185
-
186
- class VitsHifiGan(nn.Module):
187
- def __init__(self, config):
188
- super().__init__()
189
- self.config = config
190
- self.num_kernels = len(config.resblock_kernel_sizes)
191
- self.num_upsamples = len(config.upsample_rates)
192
- self.conv_pre = nn.Conv1d(
193
- config.flow_size,
194
- config.upsample_initial_channel,
195
- kernel_size=7,
196
- stride=1,
197
- padding=3,
198
- )
199
-
200
- self.upsampler = nn.ModuleList()
201
- for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
202
- self.upsampler.append(
203
- nn.ConvTranspose1d(
204
- config.upsample_initial_channel // (2**i),
205
- config.upsample_initial_channel // (2 ** (i + 1)),
206
- kernel_size=kernel_size,
207
- stride=upsample_rate,
208
- padding=(kernel_size - upsample_rate) // 2,
209
- )
210
- )
211
-
212
- self.resblocks = nn.ModuleList()
213
- for i in range(len(self.upsampler)):
214
- channels = config.upsample_initial_channel // (2 ** (i + 1))
215
- for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
216
- self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation))
217
- self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
218
-
219
- def forward(self,
220
- spectrogram):
221
- hidden_states = self.conv_pre(spectrogram)
222
- for i in range(self.num_upsamples):
223
- hidden_states = F.leaky_relu(hidden_states, negative_slope=.1, inplace=True)
224
- hidden_states = self.upsampler[i](hidden_states)
225
- res_state = self.resblocks[i * self.num_kernels](hidden_states)
226
- for j in range(1, self.num_kernels):
227
- res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
228
- hidden_states = res_state / self.num_kernels
229
- hidden_states = F.leaky_relu(hidden_states, negative_slope=.01, inplace=True)
230
- hidden_states = self.conv_post(hidden_states)
231
- waveform = torch.tanh(hidden_states)
232
- return waveform
233
-
234
-
235
- class VitsResidualCouplingLayer(nn.Module):
236
- def __init__(self, config):
237
- super().__init__()
238
- self.half_channels = config.flow_size // 2
239
- self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
240
- self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
241
- self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
242
-
243
- def forward(self,
244
- x,
245
- reverse=False):
246
- first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
247
- hidden_states = self.conv_pre(first_half)
248
- hidden_states = self.wavenet(hidden_states)
249
- mean = self.conv_post(hidden_states)
250
- second_half = (second_half - mean)
251
- outputs = torch.cat([first_half, second_half], dim=1)
252
- return outputs
253
-
254
-
255
- class VitsResidualCouplingBlock(nn.Module):
256
- def __init__(self, config):
257
- super().__init__()
258
- self.flows = nn.ModuleList()
259
- for _ in range(config.prior_encoder_num_flows):
260
- self.flows.append(VitsResidualCouplingLayer(config))
261
-
262
- def forward(self, x, reverse=False):
263
- # x L [1, 192, 481]
264
- for flow in reversed(self.flows):
265
- x = torch.flip(x, [1]) # flipud CHANNELs
266
- x = flow(x, reverse=True)
267
- return x
268
-
269
-
270
- class VitsAttention(nn.Module):
271
- """has no positional info"""
272
-
273
- def __init__(self, config):
274
- super().__init__()
275
- self.embed_dim = config.hidden_size
276
- self.num_heads = config.num_attention_heads
277
-
278
-
279
-
280
- self.head_dim = self.embed_dim // self.num_heads
281
- self.scaling = self.head_dim**-0.5
282
- self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
283
- self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
284
- self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
285
- self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
286
-
287
- def _shape(self, tensor, seq_len, bsz):
288
- return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
289
-
290
- def forward(
291
- self,
292
- hidden_states,
293
- layer_head_mask = None,
294
- output_attentions = False,
295
- ):
296
-
297
-
298
- bsz, tgt_len, _ = hidden_states.size()
299
-
300
- # Q
301
-
302
- query_states = self.q_proj(hidden_states) * self.scaling
303
-
304
- # K/V
305
- hidden_states = hidden_states[:, :40, :] # drop time-frames from k/v [bs*2, time, 96=ch]
306
- key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
307
- value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
308
- proj_shape = (bsz * self.num_heads, -1, self.head_dim)
309
- query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
310
- key_states = key_states.view(*proj_shape)
311
- value_states = value_states.view(*proj_shape)
312
-
313
-
314
-
315
- attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
316
- attn_weights = nn.functional.softmax(attn_weights, dim=-1)
317
- attn_output = torch.bmm(attn_weights,
318
- value_states)
319
- attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
320
- attn_output = attn_output.transpose(1, 2)
321
-
322
- # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
323
- # partitioned aross GPUs when using tensor-parallelism.
324
- attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
325
-
326
- attn_output = self.out_proj(attn_output)
327
-
328
- return attn_output
329
-
330
-
331
- class VitsFeedForward(nn.Module):
332
- def __init__(self, config):
333
- super().__init__()
334
- self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size, padding=1)
335
- self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size, padding=1)
336
-
337
- def forward(self, hidden_states):
338
- hidden_states = hidden_states.permute(0, 2, 1)
339
- hidden_states = F.relu(self.conv_1(hidden_states)) # inplace changes sound ;
340
- hidden_states = self.conv_2(hidden_states)
341
- hidden_states = hidden_states.permute(0, 2, 1)
342
- return hidden_states
343
-
344
-
345
- class VitsEncoderLayer(nn.Module):
346
- def __init__(self, config):
347
- super().__init__()
348
- self.attention = VitsAttention(config)
349
- self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
350
- self.feed_forward = VitsFeedForward(config)
351
- self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
352
-
353
- def forward(
354
- self,
355
- hidden_states,
356
- output_attentions = False,
357
- ):
358
- residual = hidden_states
359
- hidden_states = self.attention(
360
- hidden_states=hidden_states,
361
- # attention_mask=attention_mask,
362
- output_attentions=output_attentions,
363
- )
364
-
365
-
366
- hidden_states = self.layer_norm(residual + hidden_states)
367
-
368
- residual = hidden_states
369
- hidden_states = self.feed_forward(hidden_states)
370
-
371
- hidden_states = self.final_layer_norm(residual + hidden_states)
372
-
373
- outputs = (hidden_states,)
374
-
375
- return outputs
376
-
377
-
378
- class VitsEncoder(nn.Module):
379
- def __init__(self, config):
380
- super().__init__()
381
- self.config = config
382
- self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)])
383
-
384
- def forward(
385
- self,
386
- hidden_states):
387
- for _layer in self.layers:
388
- layer_outputs = _layer(hidden_states)
389
- hidden_states = layer_outputs[0]
390
- return hidden_states
391
-
392
-
393
-
394
- class VitsTextEncoder(nn.Module):
395
- """
396
- Has VitsEncoder
397
- """
398
-
399
- def __init__(self, config):
400
- super().__init__()
401
- self.config = config
402
- self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
403
- self.encoder = VitsEncoder(config) # 6 Layers of VitsAttention
404
- self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
405
-
406
- def forward(self,
407
- input_ids
408
- ):
409
- hidden_states = self.embed_tokens(input_ids) * 4 #Actually4-or-4.856406460551018-@-845-len-ids-deu
410
- stats = self.project(self.encoder(hidden_states=hidden_states).transpose(1, 2)).transpose(1, 2)
411
- return stats[:, :, :self.config.flow_size] # prior_means
412
-
413
-
414
- class VitsPreTrainedModel(PreTrainedModel):
415
- config_class = VitsConfig
416
- base_model_prefix = "vits"
417
- main_input_name = "input_ids"
418
- supports_gradient_checkpointing = True
419
-
420
-
421
-
422
- class VitsModel(VitsPreTrainedModel):
423
- def __init__(self, config):
424
- super().__init__(config)
425
- self.config = config
426
- self.text_encoder = VitsTextEncoder(config) # has VitsEncoder that includes 6L of VitsAttention
427
- self.flow = VitsResidualCouplingBlock(config)
428
- self.decoder = VitsHifiGan(config)
429
-
430
- def forward(
431
- self,
432
- input_ids = None,
433
- attention_mask = None,
434
- speaker_id = None,
435
- output_attentions = None,
436
- output_hidden_states = None,
437
- return_dict = None,
438
- labels = None,
439
- speed = None,
440
- lang_code = 'deu', # speed oscillation pattern per voice/lang
441
- ):
442
- mask_dtype = self.text_encoder.embed_tokens.weight.dtype
443
- if attention_mask is not None:
444
- input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
445
- else:
446
- raise ValueError
447
- input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
448
- prior_means = self.text_encoder(input_ids=input_ids)
449
-
450
- input_padding_mask = input_padding_mask.transpose(1, 2)
451
-
452
-
453
- bs, in_len, _ = prior_means.shape
454
- # VITS Duration Oscillation
455
- pattern = OSCILLATION.get(lang_code, [1, 2, 1])
456
-
457
- duration = torch.tensor(pattern,
458
- device=prior_means.device).repeat(int(in_len / len(pattern)) + 2)[None, None, :in_len] # perhaps define [1, 2, 1] per voice or language
459
- duration[:, :, 0] = 4
460
- duration[:, :, -1] = 3
461
- # ATTN
462
- predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
463
- indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
464
- output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
465
- output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
466
- attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
467
- batch_size, _, output_length, input_length = attn_mask.shape
468
- cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
469
- indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
470
- valid_indices = indices.unsqueeze(0) < cum_duration
471
- valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
472
- padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
473
- attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
474
- attn = attn[:, 0, :, :]
475
-
476
-
477
- attn = attn + 1e-4 * torch.rand_like(attn)
478
- attn /= attn.sum(2, keepdims=True)
479
- #print(attn)
480
- prior_means = torch.matmul(attn, prior_means) # try attn to contain .5/.5 instead of 1/0 so it smoothly interpolates repeated prior_means
481
-
482
- #prior_means = F.interpolate(prior_means.transpose(1,2), int(1.74 * prior_means.shape[1]), mode='linear').transpose(1,2) # extend for slow speed
483
-
484
-
485
-
486
- # prior means have now been replicated x duration of each prior mean
487
-
488
- latents = self.flow(prior_means.transpose(1, 2), # + torch.randn_like(prior_means) * .94,
489
- reverse=True)
490
-
491
- waveform = self.decoder(latents) # [bs, 1, 16000]
492
-
493
- return waveform[:, 0, :]
494
-
495
-
496
- class VitsTokenizer(PreTrainedTokenizer):
497
- vocab_files_names = {"vocab_file": "vocab.json"}
498
- model_input_names = ["input_ids", "attention_mask"]
499
-
500
- def __init__(
501
- self,
502
- vocab_file,
503
- pad_token="<pad>",
504
- unk_token="<unk>",
505
- language=None,
506
- add_blank=True,
507
- normalize=True,
508
- phonemize=True,
509
- is_uroman=False,
510
- **kwargs,
511
- ):
512
- with open(vocab_file, encoding="utf-8") as vocab_handle:
513
- self.encoder = json.load(vocab_handle)
514
-
515
- self.decoder = {v: k for k, v in self.encoder.items()}
516
- self.language = language
517
- self.add_blank = add_blank
518
- self.normalize = normalize
519
- self.phonemize = phonemize
520
-
521
- self.is_uroman = is_uroman
522
-
523
- super().__init__(
524
- pad_token=pad_token,
525
- unk_token=unk_token,
526
- language=language,
527
- add_blank=add_blank,
528
- normalize=normalize,
529
- phonemize=phonemize,
530
- is_uroman=is_uroman,
531
- **kwargs,
532
- )
533
-
534
- @property
535
- def vocab_size(self):
536
- return len(self.encoder)
537
-
538
- def get_vocab(self):
539
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
540
- vocab.update(self.added_tokens_encoder)
541
- return vocab
542
-
543
- def normalize_text(self, input_string):
544
- """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
545
- all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
546
- filtered_text = ""
547
-
548
- i = 0
549
- while i < len(input_string):
550
- found_match = False
551
- for word in all_vocabulary:
552
- if input_string[i : i + len(word)] == word:
553
- filtered_text += word
554
- i += len(word)
555
- found_match = True
556
- break
557
-
558
- if not found_match:
559
- filtered_text += input_string[i].lower()
560
- i += 1
561
-
562
- return filtered_text
563
-
564
- def _preprocess_char(self, text):
565
- """Special treatment of characters in certain languages"""
566
- if self.language == "ron":
567
- text = text.replace("ț", "ţ")
568
- return text
569
-
570
- def prepare_for_tokenization(
571
- self, text: str, is_split_into_words: bool = False, normalize = None, **kwargs):
572
-
573
- normalize = normalize if normalize is not None else self.normalize
574
-
575
- if normalize:
576
- # normalise for casing
577
- text = self.normalize_text(text)
578
-
579
- filtered_text = self._preprocess_char(text)
580
-
581
- if has_non_roman_characters(filtered_text) and self.is_uroman:
582
- # 7 langs - For now replace all to romans in app.py
583
- raise ValueError
584
-
585
- if self.phonemize:
586
- if not is_phonemizer_available():
587
- raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")
588
-
589
- filtered_text = phonemizer.phonemize(
590
- filtered_text,
591
- language="en-us",
592
- backend="espeak",
593
- strip=True,
594
- preserve_punctuation=True,
595
- with_stress=True,
596
- )
597
- filtered_text = re.sub(r"\s+", " ", filtered_text)
598
- elif normalize:
599
- # strip any chars outside of the vocab (punctuation)
600
- filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()
601
-
602
- return filtered_text, kwargs
603
-
604
- def _tokenize(self, text):
605
- """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
606
- tokens = list(text)
607
-
608
- if self.add_blank:
609
- # sounds dyslexi if no space between letters
610
- # sounds disconnected if >2 spaces between letters
611
- interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2) # + 1) # +1 rises slice index error if tokens odd
612
- interspersed[::2] = tokens
613
- tokens = interspersed + [self._convert_id_to_token(0)] # append one last space (it has indexing error ::2 mismatch if tokens is odd)
614
-
615
- return tokens
616
-
617
- def _convert_token_to_id(self, token):
618
- """Converts a token (str) in an id using the vocab."""
619
- return self.encoder.get(token, self.encoder.get(self.unk_token))
620
-
621
- def _convert_id_to_token(self, index):
622
- """Converts an index (integer) in a token (str) using the vocab."""
623
- return self.decoder.get(index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -3,10 +3,7 @@ torch
3
  pydantic==2.10.6
4
  transformers==4.49.0
5
  sentencepiece
6
- phonemizer
7
  soundfile
8
  omegaconf
9
- num2words
10
  numpy<2.0.0
11
  gradio==5.27.0
12
- Numbers2Words-Greek
 
3
  pydantic==2.10.6
4
  transformers==4.49.0
5
  sentencepiece
 
6
  soundfile
7
  omegaconf
 
8
  numpy<2.0.0
9
  gradio==5.27.0
 
textual.py DELETED
@@ -1,515 +0,0 @@
1
- import re
2
- import unicodedata
3
- from num2words import num2words
4
- from num2word_greek.numbers2words import convert_numbers
5
-
6
- def only_greek_or_only_latin(text, lang='grc'):
7
- '''
8
- str: The converted string in the specified target script.
9
- Characters not found in any mapping are preserved as is.
10
- Latin accented characters in the input (e.g., 'É', 'ü') will
11
- be preserved in their lowercase form (e.g., 'é', 'ü') if
12
- converting to Latin.
13
- '''
14
-
15
- # --- Mapping Dictionaries ---
16
- # Keys are in lowercase as input text is case-folded.
17
- # If the output needs to maintain original casing, additional logic is required.
18
-
19
- latin_to_greek_map = {
20
- 'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
21
- 'ch': 'τσο', # Example of a multi-character Latin sequence
22
- 'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ',
23
- 'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ',
24
- 'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
25
- 'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
26
- }
27
-
28
- greek_to_latin_map = {
29
- 'ου': 'ou', # Prioritize common diphthongs/digraphs
30
- 'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
31
- 'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
32
- 'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
33
- 'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y', # 'y' is a common transliteration for upsilon
34
- 'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
35
- 'ς': 's', # Final sigma
36
- }
37
-
38
- cyrillic_to_latin_map = {
39
- # 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
40
- # 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
41
- # 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
42
- # 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
43
- # 'я': 'ya',
44
- # ----------------кључеви
45
- 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
46
- 'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
47
- 'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
48
- 'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's', "ž": "z",
49
- 'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
50
- 'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '',
51
- 'э': 'e', 'ю': 'io', 'я': 'a',
52
- 'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
53
- }
54
-
55
- # Cyrillic to Greek on phonetic similarity.
56
- cyrillic_to_greek_map = {
57
- # 'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
58
- # 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'κ': 'κ', 'λ': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
59
- # 'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
60
- # 'ч': 'τσ', # or τζ depending on desired sound
61
- # 'ш': 'σ', 'щ': 'σ', # approximations
62
- # 'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
63
- # 'я': 'ια',
64
- # --------------------
65
- 'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε',
66
- 'ж': 'ζ', 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ',
67
- 'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο', 'п': 'π', 'р': 'ρ',
68
- 'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
69
- 'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
70
- #
71
- 'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
72
- 'ы': 'ι', 'ь': '',
73
- 'э': 'ε', 'ю': 'ιο', 'я': 'ια',
74
- 'ѓ': 'γ', 'ѕ': 'σ',
75
- }
76
-
77
-
78
- # Convert the input text to lowercase, preserving accents for Latin characters.
79
- # casefold() is used for more robust caseless matching across Unicode characters.
80
- lowercased_text = text.lower() #casefold()
81
- output_chars = []
82
- current_index = 0
83
-
84
- if lang == 'grc':
85
- # Combine all relevant maps for direct lookup to Greek
86
- conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
87
-
88
- # Sort keys by length in reverse order to handle multi-character sequences first
89
- sorted_source_keys = sorted(
90
- list(latin_to_greek_map.keys()) + list(cyrillic_to_greek_map.keys()),
91
- key=len,
92
- reverse=True
93
- )
94
-
95
- while current_index < len(lowercased_text):
96
- found_conversion = False
97
- for key in sorted_source_keys:
98
- if lowercased_text.startswith(key, current_index):
99
- output_chars.append(conversion_map[key])
100
- current_index += len(key)
101
- found_conversion = True
102
- break
103
- if not found_conversion:
104
- # If no specific mapping found, append the character as is.
105
- # This handles unmapped characters and already Greek characters.
106
- output_chars.append(lowercased_text[current_index])
107
- current_index += 1
108
- return ''.join(output_chars)
109
-
110
- else: # Default to 'lat' conversion
111
- # Combine Greek to Latin and Cyrillic to Latin maps.
112
- # Cyrillic map keys will take precedence in case of overlap if defined after Greek.
113
- combined_to_latin_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
114
-
115
- # Sort all relevant source keys by length in reverse for replacement
116
- sorted_source_keys = sorted(
117
- list(greek_to_latin_map.keys()) + list(cyrillic_to_latin_map.keys()),
118
- key=len,
119
- reverse=True
120
- )
121
-
122
- while current_index < len(lowercased_text):
123
- found_conversion = False
124
- for key in sorted_source_keys:
125
- if lowercased_text.startswith(key, current_index):
126
- latin_equivalent = combined_to_latin_map[key]
127
-
128
- # Strip accents ONLY if the source character was from the Greek map.
129
- # This preserves accents on original Latin characters (like 'é')
130
- # and allows for intentional accent stripping from Greek transliterations.
131
- if key in greek_to_latin_map:
132
- normalized_latin = unicodedata.normalize('NFD', latin_equivalent)
133
- stripped_latin = ''.join(c for c in normalized_latin if not unicodedata.combining(c))
134
- output_chars.append(stripped_latin)
135
- else:
136
- output_chars.append(latin_equivalent)
137
-
138
- current_index += len(key)
139
- found_conversion = True
140
- break
141
-
142
- if not found_conversion:
143
- # If no conversion happened from Greek or Cyrillic, append the character as is.
144
- # This preserves existing Latin characters (including accented ones from input),
145
- # numbers, punctuation, and other symbols.
146
- output_chars.append(lowercased_text[current_index])
147
- current_index += 1
148
-
149
- return ''.join(output_chars)
150
-
151
-
152
- # =====================================================
153
- #
154
-
155
- def fix_vocals(text, lang='ron'):
156
-
157
- # Longer phrases should come before shorter ones to prevent partial matches.
158
-
159
- ron_replacements = {
160
- 'ţ': 'ț',
161
- 'ț': 'ts',
162
- 'î': 'u',
163
- 'â': 'a',
164
- 'ş': 's',
165
- 'w': 'oui',
166
- 'k': 'c',
167
- 'l': 'll',
168
- # Math symbols
169
- 'sqrt': ' rădăcina pătrată din ',
170
- '^': ' la puterea ',
171
- '+': ' plus ',
172
- ' - ': ' minus ', # only replace if standalone so to not say minus if is a-b-c
173
- # '*': ' ori ', # times
174
- '/': ' împărțit la ', # divided by
175
- '=': ' egal cu ', # equals
176
- 'pi': ' pi ',
177
- '<': ' mai mic decât ',
178
- '>': ' mai mare decât',
179
- '%': ' la sută ', # percent (from previous)
180
- '≠': ' nu este egal cu ',
181
- '≤': ' mai mic sau egal cu ',
182
- '≥': ' mai mare sau egal cu ',
183
- '≈': ' aproximativ ',
184
- '∞': ' infinit ',
185
- '€': ' euro ',
186
- '$': ' dolar ',
187
- '£': ' liră ',
188
- '&': ' și ', # and
189
- '@': ' la ', # at
190
- '#': ' diez ', # hash
191
- '∑': ' sumă ',
192
- '∫': ' integrală ',
193
- '√': ' rădăcina pătrată a ', # more generic square root
194
- }
195
-
196
- eng_replacements = {
197
- 'wik': 'weaky',
198
- 'sh': 'ss',
199
- 'ch': 'ttss',
200
- 'oo': 'oeo',
201
- # Math symbols for English
202
- 'sqrt': ' square root of ',
203
- '^': ' to the power of ',
204
- '+': ' plus ',
205
- ' - ': ' minus ',
206
- # '*': ' times ',
207
- ' / ': ' divided by ',
208
- '=': ' equals ',
209
- 'pi': ' pi ',
210
- '<': ' less than ',
211
- '>': ' greater than ',
212
- # Additional common math symbols from previous list
213
- '%': ' percent ',
214
- '∑': ' sum ',
215
- '∫': ' integral ',
216
- '√': ' square root of ',
217
- '≠': ' not equals ',
218
- '≤': ' less than or equals ',
219
- '≥': ' greater than or equals ',
220
- '≈': ' approximately ',
221
- '∞': ' infinity ',
222
- '€': ' euro ',
223
- '$': ' dollar ',
224
- '£': ' pound ',
225
- '&': ' and ',
226
- '@': ' at ',
227
- '#': ' hash ',
228
- }
229
-
230
- serbian_replacements = {
231
- 'rn': 'rrn',
232
- 'ć': 'č',
233
- 'c': 'č',
234
- 'č': 'ts',
235
- 'đ': 'dz',
236
- 'j': 'i',
237
- 'l': 'lll',
238
- 'w': 'v',
239
- 'h': 'hh',
240
- # https://huggingface.co/facebook/mms-tts-rmc-script_latin
241
- 'sqrt': 'kvadratni koren iz',
242
- '^': ' na stepen ',
243
- '+': ' plus ',
244
- ' - ': ' minus ',
245
- '*': ' puta ',
246
- ' / ': ' podeljeno sa ',
247
- '=': ' jednako ',
248
- 'pi': ' pi ',
249
- '<': ' manje od ',
250
- '>': ' veće od ',
251
- '%': ' procenat ',
252
- '∑': ' suma ',
253
- '∫': ' integral ',
254
- '√': ' kvadratni koren ',
255
- '≠': ' nije jednako ',
256
- '≤': ' manje ili jednako od ',
257
- '≥': ' veće ili jednako od ',
258
- '≈': ' približno ',
259
- '∞': ' beskonačnost ',
260
- '€': ' evro ',
261
- '$': ' dolar ',
262
- '£': ' funta ',
263
- '&': ' i ',
264
- '@': ' et ',
265
- '#': ' taraba ',
266
- # Others
267
- # 'rn': 'rrn',
268
- # 'ć': 'č',
269
- # 'c': 'č',
270
- # 'đ': 'd',
271
- # 'l': 'le',
272
- # 'ij': 'i',
273
- # 'ji': 'i',
274
- # 'j': 'i',
275
- # 'služ': 'sloooozz', # 'službeno'
276
- # 'suver': 'siuveeerra', # 'suverena'
277
- # 'država': 'dirrezav', # 'država'
278
- # 'iči': 'ici', # 'Graniči'
279
- # 's ': 'se', # a s with space
280
- # 'q': 'ku',
281
- # 'w': 'aou',
282
- # 'z': 's',
283
- # "š": "s",
284
- # 'th': 'ta',
285
- # 'v': 'vv',
286
- # "ć": "č",
287
- # "đ": "ď",
288
- # "lj": "ľ",
289
- # "nj": "ň",
290
- # "c": "č"
291
- }
292
-
293
- deu_replacements = {
294
- 'sch': 'sh',
295
- 'ch': 'kh',
296
- 'ie': 'ee',
297
- 'ei': 'ai',
298
- 'ä': 'ae',
299
- 'ö': 'oe',
300
- 'ü': 'ue',
301
- 'ß': 'ss',
302
- # Math symbols for German
303
- 'sqrt': ' Quadratwurzel aus ',
304
- '^': ' hoch ',
305
- '+': ' plus ',
306
- ' - ': ' minus ',
307
- '*': ' mal ',
308
- ' / ': ' geteilt durch ',
309
- '=': ' gleich ',
310
- 'pi': ' pi ',
311
- '<': ' kleiner als ',
312
- '>': ' größer als',
313
- # Additional common math symbols from previous list
314
- '%': ' prozent ',
315
- '∑': ' Summe ',
316
- '∫': ' Integral ',
317
- '√': ' Quadratwurzel ',
318
- '≠': ' ungleich ',
319
- '≤': ' kleiner oder gleich ',
320
- '≥': ' größer oder gleich ',
321
- '≈': ' ungefähr ',
322
- '∞': ' unendlich ',
323
- '€': ' euro ',
324
- '$': ' dollar ',
325
- '£': ' pfund ',
326
- '&': ' und ',
327
- '@': ' at ', # 'Klammeraffe' is also common but 'at' is simpler
328
- '#': ' raute ',
329
- }
330
-
331
- fra_replacements = {
332
- # French specific phonetic replacements (add as needed)
333
- # e.g., 'ç': 's', 'é': 'e', etc.
334
- 'w': 'v',
335
- # Math symbols for French
336
- 'sqrt': ' racine carrée de ',
337
- '^': ' à la puissance ',
338
- '+': ' plus ',
339
- ' - ': ' moins ', # tiré ;
340
- '*': ' fois ',
341
- ' / ': ' divisé par ',
342
- '=': ' égale ',
343
- 'pi': ' pi ',
344
- '<': ' inférieur à ',
345
- '>': ' supérieur à ',
346
- # Add more common math symbols as needed for French
347
- '%': ' pour cent ',
348
- '∑': ' somme ',
349
- '∫': ' intégrale ',
350
- '√': ' racine carrée ',
351
- '≠': ' n\'égale pas ',
352
- '≤': ' inférieur ou égal à ',
353
- '≥': ' supérieur ou égal à ',
354
- '≈': ' approximativement ',
355
- '∞': ' infini ',
356
- '€': ' euro ',
357
- '$': ' dollar ',
358
- '£': ' livre ',
359
- '&': ' et ',
360
- '@': ' arobase ',
361
- '#': ' dièse ',
362
- }
363
-
364
- hun_replacements = {
365
- # Hungarian specific phonetic replacements (add as needed)
366
- # e.g., 'á': 'a', 'é': 'e', etc.
367
- 'ch': 'ts',
368
- 'cs': 'tz',
369
- 'g': 'gk',
370
- 'w': 'v',
371
- 'z': 'zz',
372
- # Math symbols for Hungarian
373
- 'sqrt': ' négyzetgyök ',
374
- '^': ' hatvány ',
375
- '+': ' plusz ',
376
- ' - ': ' mínusz ',
377
- '*': ' szorozva ',
378
- ' / ': ' osztva ',
379
- '=': ' egyenlő ',
380
- 'pi': ' pi ',
381
- '<': ' kisebb mint ',
382
- '>': ' nagyobb mint ',
383
- # Add more common math symbols as needed for Hungarian
384
- '%': ' százalék ',
385
- '∑': ' szumma ',
386
- '∫': ' integrál ',
387
- '√': ' négyzetgyök ',
388
- '≠': ' nem egyenlő ',
389
- '≤': ' kisebb vagy egyenlő ',
390
- '≥': ' nagyobb vagy egyenlő ',
391
- '≈': ' körülbelül ',
392
- '∞': ' végtelen ',
393
- '€': ' euró ',
394
- '$': ' dollár ',
395
- '£': ' font ',
396
- '&': ' és ',
397
- '@': ' kukac ',
398
- '#': ' kettőskereszt ',
399
- }
400
-
401
- grc_replacements = {
402
- # Ancient Greek specific phonetic replacements (add as needed)
403
- # These are more about transliterating Greek letters if they are in the input text.
404
- # Math symbols for Ancient Greek (literal translations)
405
- 'sqrt': ' τετραγωνικὴ ῥίζα ',
406
- '^': ' εἰς τὴν δύναμιν ',
407
- '+': ' σὺν ',
408
- ' - ': ' χωρὶς ',
409
- '*': ' πολλάκις ',
410
- ' / ': ' διαιρέω ',
411
- '=': ' ἴσον ',
412
- 'pi': ' πῖ ',
413
- '<': ' ἔλαττον ',
414
- '>': ' μεῖζον ',
415
- # Add more common math symbols as needed for Ancient Greek
416
- '%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
417
- '∑': ' ἄθροισμα ',
418
- '∫': ' ὁλοκλήρωμα ',
419
- '√': ' τετραγωνικὴ ῥίζα ',
420
- '≠': ' οὐκ ἴσον ',
421
- '≤': ' ἔλαττον ἢ ἴσον ',
422
- '≥': ' μεῖζον ἢ ἴσον ',
423
- '≈': ' περίπου ',
424
- '∞': ' ἄπειρον ',
425
- '€': ' εὐρώ ',
426
- '$': ' δολάριον ',
427
- '£': ' λίρα ',
428
- '&': ' καὶ ',
429
- '@': ' ἀτ ', # at
430
- '#': ' δίεση ', # hash
431
- }
432
-
433
-
434
- # Select the appropriate replacement dictionary based on the language
435
- replacements_map = {
436
- 'grc': grc_replacements,
437
- 'ron': ron_replacements,
438
- 'eng': eng_replacements,
439
- 'deu': deu_replacements,
440
- 'fra': fra_replacements,
441
- 'hun': hun_replacements,
442
- 'rmc-script_latin': serbian_replacements,
443
- }
444
-
445
- current_replacements = replacements_map.get(lang)
446
- if current_replacements:
447
- # Sort replacements by length of the key in descending order.
448
- # This is crucial for correctly replacing multi-character strings (like 'sqrt', 'sch')
449
- # before their shorter substrings ('s', 'ch', 'q', 'r', 't').
450
- sorted_replacements = sorted(current_replacements.items(), key=lambda item: len(item[0]), reverse=True)
451
- for old, new in sorted_replacements:
452
- text = text.replace(old, new)
453
- return text
454
- else:
455
- # If the language is not supported, return the original text
456
- print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
457
- return text
458
-
459
-
460
- def _num2words(text='01234', lang=None):
461
- if lang == 'grc':
462
- return convert_numbers(text)
463
- return num2words(text, lang=lang) # HAS TO BE kwarg lang=lang
464
-
465
-
466
- def transliterate_number(number_string,
467
- lang=None):
468
- if lang == 'rmc-script_latin':
469
- lang = 'sr'
470
- exponential_pronoun = ' puta deset na stepen od '
471
- comma = ' tačka '
472
- elif lang == 'ron':
473
- lang = 'ro'
474
- exponential_pronoun = ' tízszer a erejéig '
475
- comma = ' virgulă '
476
- elif lang == 'hun':
477
- lang = 'hu'
478
- exponential_pronoun = ' tízszer a erejéig '
479
- comma = ' virgula '
480
- elif lang == 'deu':
481
- exponential_pronoun = ' mal zehn hoch '
482
- comma = ' komma '
483
- elif lang == 'fra':
484
- lang = 'fr'
485
- exponential_pronoun = ' puissance '
486
- comma = 'virgule'
487
- elif lang == 'grc':
488
- exponential_pronoun = ' εις την δυναμην του '
489
- comma = 'κομμα'
490
- else:
491
- lang = lang[:2]
492
- exponential_pronoun = ' times ten to the power of '
493
- comma = ' point '
494
-
495
- def replace_number(match):
496
- prefix = match.group(1) or ""
497
- number_part = match.group(2)
498
- suffix = match.group(5) or ""
499
-
500
- try:
501
- if 'e' in number_part.lower():
502
- base, exponent = number_part.lower().split('e')
503
- words = _num2words(base, lang=lang) + exponential_pronoun + _num2words(exponent, lang=lang)
504
- elif '.' in number_part:
505
- integer_part, decimal_part = number_part.split('.')
506
- words = _num2words(integer_part, lang=lang) + comma + " ".join(
507
- [_num2words(digit, lang=lang) for digit in decimal_part])
508
- else:
509
- words = _num2words(number_part, lang=lang)
510
- return prefix + words + suffix
511
- except ValueError:
512
- return match.group(0) # Return original if conversion fails
513
-
514
- pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
515
- return re.sub(pattern, replace_number, number_string)