Spaces:

dkounadis
/

audiogen2

Running

App Files Community

Dionyssos committed on Sep 27, 2025

Commit

e8c7b60

1 Parent(s): 4813448

No TTS

Browse files

Files changed (5) hide show

README.md +3 -6
app.py +12 -131
audionar.py +0 -623
requirements.txt +0 -3
textual.py +0 -515

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: Audiogen
 emoji: 🍍
-colorFrom: green
-colorTo: blue
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py
@@ -10,9 +10,6 @@ short_description: AudioGen for CPU
 license: cc-by-nc-4.0
 tags:
 - audiogen
-- soundscapes
-- shift
-- tts
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Audiogen
 emoji: 🍍
+colorFrom: gray
+colorTo: gray
 sdk: gradio
 sdk_version: 5.41.1
 app_file: app.py
 license: cc-by-nc-4.0
 tags:
 - audiogen
+- audiocraft
 ---

app.py CHANGED Viewed

@@ -1,142 +1,33 @@
 # -*- coding: utf-8 -*-
-import typing
 import gradio as gr
 import numpy as np
-import os
 import torch
-import torch.nn as nn
 import soundfile
-from textual import only_greek_or_only_latin, transliterate_number, fix_vocals
-import textwrap
-from audionar import VitsModel, VitsTokenizer
 from audiocraft import AudioGen
 audiogen = AudioGen().eval().to('cpu')
-language_names = ['Ancient greek',
-                  'English',
-                  'Deutsch',
-                  'French',
-                  'Hungarian',
-                  'Romanian',
-                  'Serbian (Approx.)']
-def audionar_tts(text=None,
-                 lang='Romanian',
-                 soundscape='frogs',
                  max_tokens=24,
                  cache_lim=-1):
-    # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
-    lang_map = {
-            'ancient greek': 'grc',
-            'english': 'eng',
-            'deutsch': 'deu',
-            'french': 'fra',
-            'hungarian': 'hun',
-            'romanian': 'ron',
-            'serbian (approx.)': 'rmc-script_latin',
-        }
-    final_audio = None
-    if text is None or text.strip() == '':
-        x = np.zeros(4 * 16000, dtype=np.float32)  # If no txt 4s of audiogen
-    else:  # VITS
-            lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
-            global cached_lang_code, cached_net_g, cached_tokenizer
-            if 'cached_lang_code' not in globals() or cached_lang_code != lang_code:
-                cached_lang_code = lang_code
-                cached_net_g = VitsModel.from_pretrained(f'facebook/mms-tts-{lang_code}').eval()
-                cached_tokenizer = VitsTokenizer.from_pretrained(f'facebook/mms-tts-{lang_code}')
-            net_g = cached_net_g
-            tokenizer = cached_tokenizer
-            text = only_greek_or_only_latin(text, lang=lang_code)
-            text = transliterate_number(text, lang=lang_code)
-            text = fix_vocals(text, lang=lang_code) + '!'  # assures the text has at least 1 character that has token emb
-            sentences = textwrap.wrap(text, width=439)
-            total_audio_parts = []
-            for sentence in sentences:
-                inputs = cached_tokenizer(sentence, return_tensors="pt")
-                with torch.no_grad():
-                    audio_part = cached_net_g(
-                        input_ids=inputs.input_ids,
-                        attention_mask=inputs.attention_mask,
-                        lang_code=lang_code,
-                    )[0, :]
-                total_audio_parts.append(audio_part)
-            x = torch.cat(total_audio_parts).cpu().numpy()
     if soundscape and soundscape.strip():
-        speech_duration_secs = len(x) / 16000
-        target_duration = max(speech_duration_secs + 0.74, 2.0)
         # Sink Attn
         background_audio = audiogen.generate(
             soundscape[:64],  # to have shape of cross attention not grow large of T5 Num tokens
-            duration=target_duration,
-            max_tokens=min( max(7, int(max_tokens)), 288 ),  # limit sounds tokens (clone beyond)
-            cache_lim=min( max(6, int(cache_lim)), 2000),
          ).numpy()
-        # PAD
-        len_speech = len(x)
-        len_background = len(background_audio)
-        if len_background > len_speech:
-            padding = np.zeros(len_background - len_speech,
-                                dtype=np.float32)
-            x = np.concatenate([x, padding])
-        elif len_speech > len_background:
-            padding = np.zeros(len_speech - len_background,
-                                dtype=np.float32)
-            background_audio = np.concatenate([background_audio, padding])
-        x = x[:, None]
-        background_audio = background_audio[:, None]
-        final_audio = np.concatenate([
-            0.49 * x + 0.51 * background_audio,
-            0.51 * background_audio + 0.49 * x
-        ], 1)
-    else:
-        final_audio = x
     wavfile = '_vits_.wav'
-    soundfile.write(wavfile, final_audio, 16000)   # soundfile needs [time, channels]
     return wavfile
 # TTS
@@ -146,23 +37,13 @@ def audionar_tts(text=None,
 with gr.Blocks() as demo:
     with gr.Row():
         text_input = gr.Textbox(
-            label="Type text for TTS:",
-            placeholder="Type Text for TTS",
-            lines=4,
-            value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
-        )
-        lang_dropdown = gr.Dropdown(
-            choices=language_names,
-            label="Lang",
-            value=language_names[0],
-        )
-        soundscape_input = gr.Textbox(
-            lines=1,
-            value="dogs barging",
-            label="AudioGen Txt"
         )
         cache_lim = gr.Number(
-            label="Flush kv",
             value=71,
         )
         n_tokens = gr.Number(
@@ -176,7 +57,7 @@ with gr.Blocks() as demo:
     generate_button.click(
         fn=audionar_tts,
-        inputs=[text_input, lang_dropdown, soundscape_input, n_tokens, cache_lim],
         outputs=[output_audio]
     )
 demo.launch(debug=True)

 # -*- coding: utf-8 -*-
 import gradio as gr
 import numpy as np
 import torch
 import soundfile
 from audiocraft import AudioGen
 audiogen = AudioGen().eval().to('cpu')
+def audionar_tts(text='frogs',
                  max_tokens=24,
                  cache_lim=-1):
+    """Generate a sound from a text prompt with AudioGen and save it as a WAV file.
+
+    Args:
+        text: Description of the sound. Only the first 64 characters are
+            forwarded to AudioGen.
+        max_tokens: Forwarded to ``audiogen.generate`` (clamped to >= 7) and
+            also used to derive the requested duration.
+        cache_lim: Forwarded to ``audiogen.generate`` (clamped to >= 6).
+            NOTE(review): presumably the kv-cache flush threshold - confirm
+            against the audiocraft module used by this Space.
+
+    Returns:
+        Path of the WAV file written at a 16000 Hz sample rate.
+    """
+    # NOTE(review): 320 / 16000 looks like samples-per-token over the sample
+    # rate - confirm. The duration is never shorter than 2 seconds.
+    dur_seconds = max(max_tokens * 320 / 16000 + 0.74, 2.0)
+    # The prompt parameter is `text`; the previous body still read the removed
+    # `soundscape` parameter and raised NameError on every call.
+    if text and text.strip():
         # Sink Attn
         background_audio = audiogen.generate(
+            text[:64],  # cap the prompt so the T5 cross-attention input stays small
             duration=dur_seconds,
             max_tokens=max(7, int(max_tokens)),  # kv cache lowest n_preserve
             cache_lim=max(6, int(cache_lim)),
          ).numpy()
+    else:
+        # Empty prompt: write silence of the requested length instead of
+        # failing on an unbound `background_audio`.
+        background_audio = np.zeros(int(dur_seconds * 16000), dtype=np.float32)
     wavfile = '_vits_.wav'
     soundfile.write(wavfile, background_audio, 16000)   # soundfile needs [time, channels]
     return wavfile
 # Gradio UI
 with gr.Blocks() as demo:
     with gr.Row():
         text_input = gr.Textbox(
+            label="AudioGen Txt:",
+            placeholder="Describe sound - Type Any language",
+            lines=2,
+            value='dogs barg',
         )
         cache_lim = gr.Number(
+            label="kv Cache Flush:",
             value=71,
         )
         n_tokens = gr.Number(
     # Wire the button to the generator. Inputs are passed positionally, so
     # their order must match audionar_tts(text, max_tokens, cache_lim).
     generate_button.click(
         fn=audionar_tts,
+        inputs=[text_input, n_tokens, cache_lim],  # the Textbox is bound to `text_input`; `text` is not defined here
         outputs=[output_audio]
     )
 demo.launch(debug=True)

audionar.py DELETED Viewed

@@ -1,623 +0,0 @@
-import math
-import numpy as np
-import torch
-from torch import nn
-from transformers.modeling_utils import PreTrainedModel
-from transformers.configuration_utils import PretrainedConfig
-import json
-import os
-import re
-from transformers.tokenization_utils import PreTrainedTokenizer
-import phonemizer
-import torch.nn.functional as F
-OSCILLATION = {
-        'deu': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
-        'rmc-script_latin': [2, 2, 1, 2, 2],
-        'hun': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
-        'fra': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
-        'eng': [1, 2, 2, 1, 2, 2],
-        'grc': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1],
-        'ron': [1, 2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2],
-    }
-def has_non_roman_characters(input_string):
-    # Find any character outside the ASCII range
-    non_roman_pattern = re.compile(r"[^\x00-\x7F]")
-    # Search the input string for non-Roman characters
-    match = non_roman_pattern.search(input_string)
-    has_non_roman = match is not None
-    return has_non_roman
-class VitsConfig(PretrainedConfig):
-    model_type = "vits"
-    def __init__(
-        self,
-        vocab_size=38,
-        hidden_size=192,
-        num_hidden_layers=6,
-        num_attention_heads=2,
-        use_bias=True,
-        ffn_dim=768,
-        ffn_kernel_size=3,
-        flow_size=192,
-        # hidden_act="relu",
-        upsample_initial_channel=512,
-        upsample_rates=[8, 8, 2, 2],
-        upsample_kernel_sizes=[16, 16, 4, 4],
-        resblock_kernel_sizes=[3, 7, 11],
-        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
-        prior_encoder_num_flows=4,
-        prior_encoder_num_wavenet_layers=4,
-        wavenet_kernel_size=5,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.hidden_size = hidden_size
-        self.num_hidden_layers = num_hidden_layers
-        self.num_attention_heads = num_attention_heads
-        self.use_bias = use_bias
-        self.ffn_dim = ffn_dim
-        self.ffn_kernel_size = ffn_kernel_size
-        self.flow_size = flow_size
-        self.upsample_initial_channel = upsample_initial_channel
-        self.upsample_rates = upsample_rates
-        self.upsample_kernel_sizes = upsample_kernel_sizes
-        self.resblock_kernel_sizes = resblock_kernel_sizes
-        self.resblock_dilation_sizes = resblock_dilation_sizes
-        self.prior_encoder_num_flows = prior_encoder_num_flows
-        self.prior_encoder_num_wavenet_layers = prior_encoder_num_wavenet_layers
-        self.wavenet_kernel_size = wavenet_kernel_size
-        super().__init__()
-class VitsWaveNet(torch.nn.Module):
-    def __init__(self, config, num_layers):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-        self.num_layers = num_layers
-        self.in_layers = torch.nn.ModuleList()
-        self.res_skip_layers = torch.nn.ModuleList()
-        # if hasattr(nn.utils.parametrizations, "weight_norm"):
-        #     # raise ValueError
-        weight_norm = nn.utils.parametrizations.weight_norm
-        # else:
-        #     raise ValueError
-        #     # weight_norm = nn.utils.weight_norm
-        for i in range(num_layers):
-            in_layer = torch.nn.Conv1d(
-                in_channels=config.hidden_size,
-                out_channels=2 * config.hidden_size,
-                kernel_size=config.wavenet_kernel_size,
-                dilation=1,
-                padding=2,
-            )
-            in_layer = weight_norm(in_layer, name="weight")
-            self.in_layers.append(in_layer)
-            # last one is not necessary
-            if i < num_layers - 1:
-                res_skip_channels = 2 * config.hidden_size
-            else:
-                res_skip_channels = config.hidden_size
-            res_skip_layer = torch.nn.Conv1d(config.hidden_size, res_skip_channels, 1)
-            res_skip_layer = weight_norm(res_skip_layer, name="weight")
-            self.res_skip_layers.append(res_skip_layer)
-    def forward(self,
-                inputs):
-        outputs = torch.zeros_like(inputs)
-        num_channels = torch.IntTensor([self.hidden_size])[0]
-        for i in range(self.num_layers):
-            in_act = self.in_layers[i](inputs)
-            # global_states = torch.zeros_like(hidden_states)  # style ?
-            # acts = fused_add_tanh_sigmoid_multiply(hidden_states, global_states, num_channels_tensor[0])
-            # --
-            # def fused_add_tanh_sigmoid_multiply(input_a, input_b, num_channels):
-            # in_act = input_a #  + input_b
-            t_act = torch.tanh(in_act[:, :num_channels, :])
-            s_act = torch.sigmoid(in_act[:, num_channels:, :])
-            acts = t_act * s_act
-            res_skip_acts = self.res_skip_layers[i](acts)
-            if i < self.num_layers - 1:
-                res_acts = res_skip_acts[:, : self.hidden_size, :]
-                inputs = inputs + res_acts
-                outputs = outputs + res_skip_acts[:, self.hidden_size :, :]
-            else:
-                outputs = outputs + res_skip_acts
-        return outputs
-# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
-class HifiGanResidualBlock(nn.Module):
-    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
-        super().__init__()
-        self.leaky_relu_slope = leaky_relu_slope
-        self.convs1 = nn.ModuleList(
-            [
-                nn.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    stride=1,
-                    dilation=dilation[i],
-                    padding=self.get_padding(kernel_size, dilation[i]),
-                )
-                for i in range(len(dilation))
-            ]
-        )
-        self.convs2 = nn.ModuleList(
-            [
-                nn.Conv1d(
-                    channels,
-                    channels,
-                    kernel_size,
-                    stride=1,
-                    dilation=1,
-                    padding=self.get_padding(kernel_size, 1),
-                )
-                for _ in range(len(dilation))
-            ]
-        )
-    def get_padding(self, kernel_size, dilation=1):
-        # 1, 3, 5, 15
-        return (kernel_size * dilation - dilation) // 2
-    def forward(self, hidden_states):
-        for conv1, conv2 in zip(self.convs1, self.convs2):
-            residual = hidden_states
-            hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
-            hidden_states = conv1(hidden_states)
-            hidden_states = nn.functional.leaky_relu(hidden_states, negative_slope=self.leaky_relu_slope)
-            hidden_states = conv2(hidden_states)
-            hidden_states = hidden_states + residual
-        return hidden_states
-class VitsHifiGan(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.num_kernels = len(config.resblock_kernel_sizes)
-        self.num_upsamples = len(config.upsample_rates)
-        self.conv_pre = nn.Conv1d(
-            config.flow_size,
-            config.upsample_initial_channel,
-            kernel_size=7,
-            stride=1,
-            padding=3,
-        )
-        self.upsampler = nn.ModuleList()
-        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
-            self.upsampler.append(
-                nn.ConvTranspose1d(
-                    config.upsample_initial_channel // (2**i),
-                    config.upsample_initial_channel // (2 ** (i + 1)),
-                    kernel_size=kernel_size,
-                    stride=upsample_rate,
-                    padding=(kernel_size - upsample_rate) // 2,
-                )
-            )
-        self.resblocks = nn.ModuleList()
-        for i in range(len(self.upsampler)):
-            channels = config.upsample_initial_channel // (2 ** (i + 1))
-            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
-                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation))
-        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3, bias=False)
-    def forward(self,
-                spectrogram):
-        hidden_states = self.conv_pre(spectrogram)
-        for i in range(self.num_upsamples):
-            hidden_states = F.leaky_relu(hidden_states, negative_slope=.1, inplace=True)
-            hidden_states = self.upsampler[i](hidden_states)
-            res_state = self.resblocks[i * self.num_kernels](hidden_states)
-            for j in range(1, self.num_kernels):
-                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
-            hidden_states = res_state / self.num_kernels
-        hidden_states = F.leaky_relu(hidden_states, negative_slope=.01, inplace=True)
-        hidden_states = self.conv_post(hidden_states)
-        waveform = torch.tanh(hidden_states)
-        return waveform
-class VitsResidualCouplingLayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.half_channels = config.flow_size // 2
-        self.conv_pre = nn.Conv1d(self.half_channels, config.hidden_size, 1)
-        self.wavenet = VitsWaveNet(config, num_layers=config.prior_encoder_num_wavenet_layers)
-        self.conv_post = nn.Conv1d(config.hidden_size, self.half_channels, 1)
-    def forward(self,
-                x,
-                reverse=False):
-        first_half, second_half = torch.split(x, [self.half_channels] * 2, dim=1)
-        hidden_states = self.conv_pre(first_half)
-        hidden_states = self.wavenet(hidden_states)
-        mean = self.conv_post(hidden_states)
-        second_half = (second_half - mean)
-        outputs = torch.cat([first_half, second_half], dim=1)
-        return outputs
-class VitsResidualCouplingBlock(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.flows = nn.ModuleList()
-        for _ in range(config.prior_encoder_num_flows):
-            self.flows.append(VitsResidualCouplingLayer(config))
-    def forward(self, x, reverse=False):
-        # x L [1, 192, 481]
-        for flow in reversed(self.flows):
-            x = torch.flip(x, [1])  # flipud CHANNELs
-            x = flow(x, reverse=True)
-        return x
-class VitsAttention(nn.Module):
-    """has no positional info"""
-    def __init__(self, config):
-        super().__init__()
-        self.embed_dim = config.hidden_size
-        self.num_heads = config.num_attention_heads
-        self.head_dim = self.embed_dim // self.num_heads
-        self.scaling = self.head_dim**-0.5
-        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
-        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
-        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
-        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.use_bias)
-    def _shape(self, tensor, seq_len, bsz):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-    def forward(
-        self,
-        hidden_states,
-        layer_head_mask = None,
-        output_attentions = False,
-    ):
-        bsz, tgt_len, _ = hidden_states.size()
-        # Q
-        query_states = self.q_proj(hidden_states) * self.scaling
-        # K/V
-        hidden_states = hidden_states[:, :40, :]  # drop time-frames from k/v [bs*2, time, 96=ch]
-        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
-        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
-        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.view(*proj_shape)
-        value_states = value_states.view(*proj_shape)
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        attn_output = torch.bmm(attn_weights,
-                                value_states)
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned aross GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
-        attn_output = self.out_proj(attn_output)
-        return attn_output
-class VitsFeedForward(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.conv_1 = nn.Conv1d(config.hidden_size, config.ffn_dim, config.ffn_kernel_size, padding=1)
-        self.conv_2 = nn.Conv1d(config.ffn_dim, config.hidden_size, config.ffn_kernel_size, padding=1)
-    def forward(self, hidden_states):
-        hidden_states = hidden_states.permute(0, 2, 1)
-        hidden_states = F.relu(self.conv_1(hidden_states))  # inplace changes sound ;
-        hidden_states = self.conv_2(hidden_states)
-        hidden_states = hidden_states.permute(0, 2, 1)
-        return hidden_states
-class VitsEncoderLayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.attention = VitsAttention(config)
-        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
-        self.feed_forward = VitsFeedForward(config)
-        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=1e-5)
-    def forward(
-        self,
-        hidden_states,
-        output_attentions = False,
-    ):
-        residual = hidden_states
-        hidden_states = self.attention(
-            hidden_states=hidden_states,
-            # attention_mask=attention_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = self.layer_norm(residual + hidden_states)
-        residual = hidden_states
-        hidden_states = self.feed_forward(hidden_states)
-        hidden_states = self.final_layer_norm(residual + hidden_states)
-        outputs = (hidden_states,)
-        return outputs
-class VitsEncoder(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.layers = nn.ModuleList([VitsEncoderLayer(config) for _ in range(config.num_hidden_layers)])
-    def forward(
-        self,
-        hidden_states):
-        for _layer in self.layers:
-            layer_outputs = _layer(hidden_states)
-            hidden_states = layer_outputs[0]
-        return hidden_states
-class VitsTextEncoder(nn.Module):
-    """
-    Has VitsEncoder
-    """
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
-        self.encoder = VitsEncoder(config)  # 6 Layers of VitsAttention
-        self.project = nn.Conv1d(config.hidden_size, config.flow_size * 2, kernel_size=1)
-    def forward(self,
-                input_ids
-                ):
-        hidden_states = self.embed_tokens(input_ids) * 4      #Actually4-or-4.856406460551018-@-845-len-ids-deu
-        stats = self.project(self.encoder(hidden_states=hidden_states).transpose(1, 2)).transpose(1, 2)
-        return stats[:, :, :self.config.flow_size]  # prior_means
-class VitsPreTrainedModel(PreTrainedModel):
-    config_class = VitsConfig
-    base_model_prefix = "vits"
-    main_input_name = "input_ids"
-    supports_gradient_checkpointing = True
-class VitsModel(VitsPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.config = config
-        self.text_encoder = VitsTextEncoder(config)  # has VitsEncoder that includes 6L of VitsAttention
-        self.flow = VitsResidualCouplingBlock(config)
-        self.decoder = VitsHifiGan(config)
-    def forward(
-        self,
-        input_ids = None,
-        attention_mask = None,
-        speaker_id = None,
-        output_attentions = None,
-        output_hidden_states = None,
-        return_dict = None,
-        labels = None,
-        speed = None,
-        lang_code = 'deu',  # speed oscillation pattern per voice/lang
-    ):
-        mask_dtype = self.text_encoder.embed_tokens.weight.dtype
-        if attention_mask is not None:
-            input_padding_mask = attention_mask.unsqueeze(-1).to(mask_dtype)
-        else:
-            raise ValueError
-            input_padding_mask = torch.ones_like(input_ids).unsqueeze(-1).to(mask_dtype)
-        prior_means = self.text_encoder(input_ids=input_ids)
-        input_padding_mask = input_padding_mask.transpose(1, 2)
-        bs, in_len, _ = prior_means.shape
-        # VITS Duration Oscillation
-        pattern = OSCILLATION.get(lang_code, [1, 2, 1])
-        duration = torch.tensor(pattern,
-                                device=prior_means.device).repeat(int(in_len / len(pattern)) + 2)[None, None, :in_len]   # perhaps define [1, 2, 1] per voice or language
-        duration[:, :, 0] = 4
-        duration[:, :, -1] = 3
-        # ATTN
-        predicted_lengths = torch.clamp_min(torch.sum(duration, [1, 2]), 1).long()
-        indices = torch.arange(predicted_lengths.max(), dtype=predicted_lengths.dtype, device=predicted_lengths.device)
-        output_padding_mask = indices.unsqueeze(0) < predicted_lengths.unsqueeze(1)
-        output_padding_mask = output_padding_mask.unsqueeze(1).to(input_padding_mask.dtype)
-        attn_mask = torch.unsqueeze(input_padding_mask, 2) * torch.unsqueeze(output_padding_mask, -1)
-        batch_size, _, output_length, input_length = attn_mask.shape
-        cum_duration = torch.cumsum(duration, -1).view(batch_size * input_length, 1)
-        indices = torch.arange(output_length, dtype=duration.dtype, device=duration.device)
-        valid_indices = indices.unsqueeze(0) < cum_duration
-        valid_indices = valid_indices.to(attn_mask.dtype).view(batch_size, input_length, output_length)
-        padded_indices = valid_indices - nn.functional.pad(valid_indices, [0, 0, 1, 0, 0, 0])[:, :-1]
-        attn = padded_indices.unsqueeze(1).transpose(2, 3) * attn_mask
-        attn = attn[:, 0, :, :]
-        attn = attn + 1e-4 * torch.rand_like(attn)
-        attn /= attn.sum(2, keepdims=True)
-        #print(attn)
-        prior_means = torch.matmul(attn, prior_means)  # try attn to contain .5/.5 instead of 1/0 so it smoothly interpolates repeated prior_means
-        #prior_means = F.interpolate(prior_means.transpose(1,2),   int(1.74 * prior_means.shape[1]), mode='linear').transpose(1,2)  # extend for slow speed
-        # prior means have now been replicated x duration of each prior mean
-        latents = self.flow(prior_means.transpose(1, 2), # + torch.randn_like(prior_means) * .94,
-                            reverse=True)
-        waveform = self.decoder(latents)  # [bs, 1, 16000]
-        return waveform[:, 0, :]
-class VitsTokenizer(PreTrainedTokenizer):
-    vocab_files_names = {"vocab_file": "vocab.json"}
-    model_input_names = ["input_ids", "attention_mask"]
-    def __init__(
-        self,
-        vocab_file,
-        pad_token="<pad>",
-        unk_token="<unk>",
-        language=None,
-        add_blank=True,
-        normalize=True,
-        phonemize=True,
-        is_uroman=False,
-        **kwargs,
-    ):
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.language = language
-        self.add_blank = add_blank
-        self.normalize = normalize
-        self.phonemize = phonemize
-        self.is_uroman = is_uroman
-        super().__init__(
-            pad_token=pad_token,
-            unk_token=unk_token,
-            language=language,
-            add_blank=add_blank,
-            normalize=normalize,
-            phonemize=phonemize,
-            is_uroman=is_uroman,
-            **kwargs,
-        )
-    @property
-    def vocab_size(self):
-        return len(self.encoder)
-    def get_vocab(self):
-        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
-        vocab.update(self.added_tokens_encoder)
-        return vocab
-    def normalize_text(self, input_string):
-        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
-        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
-        filtered_text = ""
-        i = 0
-        while i < len(input_string):
-            found_match = False
-            for word in all_vocabulary:
-                if input_string[i : i + len(word)] == word:
-                    filtered_text += word
-                    i += len(word)
-                    found_match = True
-                    break
-            if not found_match:
-                filtered_text += input_string[i].lower()
-                i += 1
-        return filtered_text
-    def _preprocess_char(self, text):
-        """Special treatment of characters in certain languages"""
-        if self.language == "ron":
-            text = text.replace("ț", "ţ")
-        return text
-    def prepare_for_tokenization(
-        self, text: str, is_split_into_words: bool = False, normalize = None, **kwargs):
-        normalize = normalize if normalize is not None else self.normalize
-        if normalize:
-            # normalise for casing
-            text = self.normalize_text(text)
-        filtered_text = self._preprocess_char(text)
-        if has_non_roman_characters(filtered_text) and self.is_uroman:
-            # 7 langs -  For now replace all to romans in app.py
-            raise ValueError
-        if self.phonemize:
-            if not is_phonemizer_available():
-                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")
-            filtered_text = phonemizer.phonemize(
-                filtered_text,
-                language="en-us",
-                backend="espeak",
-                strip=True,
-                preserve_punctuation=True,
-                with_stress=True,
-            )
-            filtered_text = re.sub(r"\s+", " ", filtered_text)
-        elif normalize:
-            # strip any chars outside of the vocab (punctuation)
-            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()
-        return filtered_text, kwargs
-    def _tokenize(self, text):
-        """Tokenize a string by inserting the `<pad>` token at the boundary between adjacent characters."""
-        tokens = list(text)
-        if self.add_blank:
-            # sounds dyslexi if no space between letters
-            # sounds disconnected if >2 spaces between letters
-            interspersed = [self._convert_id_to_token(0)] * (len(tokens) * 2) # + 1)  # +1 rises slice index error if tokens odd
-            interspersed[::2] = tokens
-            tokens = interspersed + [self._convert_id_to_token(0)]  # append one last space (it has indexing error ::2 mismatch if tokens is odd)
-        return tokens
-    def _convert_token_to_id(self, token):
-        """Converts a token (str) in an id using the vocab."""
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-    def _convert_id_to_token(self, index):
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index)

requirements.txt CHANGED Viewed

@@ -3,10 +3,7 @@ torch
 pydantic==2.10.6
 transformers==4.49.0
 sentencepiece
-phonemizer
 soundfile
 omegaconf
-num2words
 numpy<2.0.0
 gradio==5.27.0
-Numbers2Words-Greek

 pydantic==2.10.6
 transformers==4.49.0
 sentencepiece
 soundfile
 omegaconf
 numpy<2.0.0
 gradio==5.27.0

textual.py DELETED Viewed

@@ -1,515 +0,0 @@
-import re
-import unicodedata
-from num2words import num2words
-from num2word_greek.numbers2words import convert_numbers
-def only_greek_or_only_latin(text, lang='grc'):
-    '''
-        str: The converted string in the specified target script.
-             Characters not found in any mapping are preserved as is.
-             Latin accented characters in the input (e.g., 'É', 'ü') will
-             be preserved in their lowercase form (e.g., 'é', 'ü') if
-             converting to Latin.
-    '''
-    # --- Mapping Dictionaries ---
-    # Keys are in lowercase as input text is case-folded.
-    # If the output needs to maintain original casing, additional logic is required.
-    latin_to_greek_map = {
-        'a': 'α', 'b': 'β', 'g': 'γ', 'd': 'δ', 'e': 'ε',
-        'ch': 'τσο', # Example of a multi-character Latin sequence
-        'z': 'ζ', 'h': 'χ', 'i': 'ι', 'j': 'ζ', 'k': 'κ', 'l': 'λ',
-        'm': 'μ', 'n': 'ν', 'x': 'ξ', 'o': 'ο', 'p': 'π', 'q': 'κ',
-        'v': 'β', 'sc': 'σκ', 'r': 'ρ', 's': 'σ', 't': 'τ',
-        'u': 'ου', 'f': 'φ', 'c': 'σ', 'w': 'β', 'y': 'γ',
-    }
-    greek_to_latin_map = {
-        'ου': 'ou', # Prioritize common diphthongs/digraphs
-        'α': 'a', 'β': 'v', 'γ': 'g', 'δ': 'd', 'ε': 'e',
-        'ζ': 'z', 'η': 'i', 'θ': 'th', 'ι': 'i', 'κ': 'k',
-        'λ': 'l', 'μ': 'm', 'ν': 'n', 'ξ': 'x', 'ο': 'o',
-        'π': 'p', 'ρ': 'r', 'σ': 's', 'τ': 't', 'υ': 'y', # 'y' is a common transliteration for upsilon
-        'φ': 'f', 'χ': 'ch', 'ψ': 'ps', 'ω': 'o',
-        'ς': 's', # Final sigma
-    }
-    cyrillic_to_latin_map = {
-        # 'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'yo', 'ж': 'zh',
-        # 'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
-        # 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
-        # 'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e', 'ю': 'yu',
-        # 'я': 'ya',
-        # ----------------кључеви
-        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ж': 'z',
-        'з': 'z', 'и': 'i', 'ј': 'j', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n',
-        'о': 'o', 'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f',
-        'х': 'h', 'ц': 'c', 'ч': 'c', 'ш': 's', "ž": "z",
-        'ђ': 'dzi', 'љ': 'li', 'њ': 'ni', 'ћ': 'c', 'џ': 'dz',
-        'ё': 'e', 'й': 'i', 'щ': 's', 'ъ': '', 'ы': 'y', 'ь': '',
-        'э': 'e', 'ю': 'io', 'я': 'a',
-        'ѓ': 'y', 'ѕ': 's', 'ќ': 'k',
-    }
-    # Cyrillic to Greek on phonetic similarity.
-    cyrillic_to_greek_map = {
-        # 'а': 'α', 'б': 'β', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε', 'ё': 'ιο', 'ж': 'ζ',
-        # 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'κ': 'κ', 'λ': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο',
-        # 'π': 'π', 'ρ': 'ρ', 'σ': 'σ', 'τ': 'τ', 'у': 'ου', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
-        # 'ч': 'τσ', # or τζ depending on desired sound
-        # 'ш': 'σ', 'щ': 'σ', # approximations
-        # 'ъ': '', 'ы': 'ι', 'ь': '', 'э': 'ε', 'ю': 'ιου',
-        # 'я': 'ια',
-        # --------------------
-        'а': 'α', 'б': 'μπ', 'в': 'β', 'г': 'γ', 'д': 'δ', 'е': 'ε',
-        'ж': 'ζ', 'з': 'ζ', 'и': 'ι', 'й': 'ι', 'к': 'κ',
-        'л': 'λ', 'м': 'μ', 'н': 'ν', 'о': 'ο', 'п': 'π', 'р': 'ρ',
-        'с': 'τσ', 'т': 'τ', 'у': 'ού', 'ф': 'φ', 'х': 'χ', 'ц': 'τσ',
-        'ч': 'τσ', 'ш': 'σ', 'щ': 'σ',
-        #
-        'ђ': 'ντζι', 'љ': 'λι', 'њ': 'νι', 'ћ': 'τσ', 'џ': 'ντζ',
-        'ы': 'ι', 'ь': '',
-        'э': 'ε', 'ю': 'ιο', 'я': 'ια',
-        'ѓ': 'γ', 'ѕ': 'σ',
-    }
-    # Convert the input text to lowercase, preserving accents for Latin characters.
-    # casefold() is used for more robust caseless matching across Unicode characters.
-    lowercased_text = text.lower()  #casefold()
-    output_chars = []
-    current_index = 0
-    if lang == 'grc':
-        # Combine all relevant maps for direct lookup to Greek
-        conversion_map = {**latin_to_greek_map, **cyrillic_to_greek_map}
-        # Sort keys by length in reverse order to handle multi-character sequences first
-        sorted_source_keys = sorted(
-            list(latin_to_greek_map.keys()) + list(cyrillic_to_greek_map.keys()),
-            key=len,
-            reverse=True
-        )
-        while current_index < len(lowercased_text):
-            found_conversion = False
-            for key in sorted_source_keys:
-                if lowercased_text.startswith(key, current_index):
-                    output_chars.append(conversion_map[key])
-                    current_index += len(key)
-                    found_conversion = True
-                    break
-            if not found_conversion:
-                # If no specific mapping found, append the character as is.
-                # This handles unmapped characters and already Greek characters.
-                output_chars.append(lowercased_text[current_index])
-                current_index += 1
-        return ''.join(output_chars)
-    else: # Default to 'lat' conversion
-        # Combine Greek to Latin and Cyrillic to Latin maps.
-        # Cyrillic map keys will take precedence in case of overlap if defined after Greek.
-        combined_to_latin_map = {**greek_to_latin_map, **cyrillic_to_latin_map}
-        # Sort all relevant source keys by length in reverse for replacement
-        sorted_source_keys = sorted(
-            list(greek_to_latin_map.keys()) + list(cyrillic_to_latin_map.keys()),
-            key=len,
-            reverse=True
-        )
-        while current_index < len(lowercased_text):
-            found_conversion = False
-            for key in sorted_source_keys:
-                if lowercased_text.startswith(key, current_index):
-                    latin_equivalent = combined_to_latin_map[key]
-                    # Strip accents ONLY if the source character was from the Greek map.
-                    # This preserves accents on original Latin characters (like 'é')
-                    # and allows for intentional accent stripping from Greek transliterations.
-                    if key in greek_to_latin_map:
-                        normalized_latin = unicodedata.normalize('NFD', latin_equivalent)
-                        stripped_latin = ''.join(c for c in normalized_latin if not unicodedata.combining(c))
-                        output_chars.append(stripped_latin)
-                    else:
-                        output_chars.append(latin_equivalent)
-                    current_index += len(key)
-                    found_conversion = True
-                    break
-            if not found_conversion:
-                # If no conversion happened from Greek or Cyrillic, append the character as is.
-                # This preserves existing Latin characters (including accented ones from input),
-                # numbers, punctuation, and other symbols.
-                output_chars.append(lowercased_text[current_index])
-                current_index += 1
-        return ''.join(output_chars)
-# =====================================================
-#
-def fix_vocals(text, lang='ron'):
-    # Longer phrases should come before shorter ones to prevent partial matches.
-    ron_replacements = {
-        'ţ': 'ț',
-        'ț': 'ts',
-        'î': 'u',
-        'â': 'a',
-        'ş': 's',
-        'w': 'oui',
-        'k': 'c',
-        'l': 'll',
-        # Math symbols
-        'sqrt': ' rădăcina pătrată din ',
-        '^': ' la puterea ',
-        '+': ' plus ',
-        ' - ': ' minus ',  # only replace if standalone so to not say minus if is a-b-c
-        # '*': ' ori ',  # times
-        '/': ' împărțit la ',  # divided by
-        '=': ' egal cu ',  # equals
-        'pi': ' pi ',
-        '<': ' mai mic decât ',
-        '>': ' mai mare decât',
-        '%': ' la sută ', # percent (from previous)
-        '≠': ' nu este egal cu ',
-        '≤': ' mai mic sau egal cu ',
-        '≥': ' mai mare sau egal cu ',
-        '≈': ' aproximativ ',
-        '∞': ' infinit ',
-        '€': ' euro ',
-        '$': ' dolar ',
-        '£': ' liră ',
-        '&': ' și ',  # and
-        '@': ' la ',  # at
-        '#': ' diez ',  # hash
-        '∑': ' sumă ',
-        '∫': ' integrală ',
-        '√': ' rădăcina pătrată a ', # more generic square root
-    }
-    eng_replacements = {
-        'wik': 'weaky',
-        'sh': 'ss',
-        'ch': 'ttss',
-        'oo': 'oeo',
-        # Math symbols for English
-        'sqrt': ' square root of ',
-        '^': ' to the power of ',
-        '+': ' plus ',
-        ' - ': ' minus ',
-        # '*': ' times ',
-        ' / ': ' divided by ',
-        '=': ' equals ',
-        'pi': ' pi ',
-        '<': ' less than ',
-        '>': ' greater than ',
-        # Additional common math symbols from previous list
-        '%': ' percent ',
-        '∑': ' sum ',
-        '∫': ' integral ',
-        '√': ' square root of ',
-        '≠': ' not equals ',
-        '≤': ' less than or equals ',
-        '≥': ' greater than or equals ',
-        '≈': ' approximately ',
-        '∞': ' infinity ',
-        '€': ' euro ',
-        '$': ' dollar ',
-        '£': ' pound ',
-        '&': ' and ',
-        '@': ' at ',
-        '#': ' hash ',
-    }
-    serbian_replacements = {
-        'rn': 'rrn',
-        'ć': 'č',
-        'c': 'č',
-        'č': 'ts',
-        'đ': 'dz',
-        'j': 'i',
-        'l': 'lll',
-        'w': 'v',
-        'h': 'hh',
-        #  https://huggingface.co/facebook/mms-tts-rmc-script_latin
-        'sqrt': 'kvadratni koren iz',
-        '^': ' na stepen ',
-        '+': ' plus ',
-        ' - ': ' minus ',
-        '*': ' puta ',
-        ' / ': ' podeljeno sa ',
-        '=': ' jednako ',
-        'pi': ' pi ',
-        '<': ' manje od ',
-        '>': ' veće od ',
-        '%': ' procenat ',
-        '∑': ' suma ',
-        '∫': ' integral ',
-        '√': ' kvadratni koren ',
-        '≠': ' nije jednako ',
-        '≤': ' manje ili jednako od ',
-        '≥': ' veće ili jednako od ',
-        '≈': ' približno ',
-        '∞': ' beskonačnost ',
-        '€': ' evro ',
-        '$': ' dolar ',
-        '£': ' funta ',
-        '&': ' i ',
-        '@': ' et ',
-        '#': ' taraba ',
-        # Others
-        #     'rn': 'rrn',
-        # 'ć': 'č',
-        # 'c': 'č',
-        # 'đ': 'd',
-        # 'l': 'le',
-        # 'ij': 'i',
-        # 'ji': 'i',
-        # 'j': 'i',
-        # 'služ': 'sloooozz',  # 'službeno'
-        # 'suver': 'siuveeerra',  # 'suverena'
-        # 'država': 'dirrezav',  # 'država'
-        # 'iči': 'ici',  # 'Graniči'
-        # 's ': 'se',  # a s with space
-        # 'q': 'ku',
-        # 'w': 'aou',
-        # 'z': 's',
-        # "š": "s",
-        # 'th': 'ta',
-        # 'v': 'vv',
-        # "ć": "č",
-        # "đ": "ď",
-        # "lj": "ľ",
-        # "nj": "ň",
-        # "c": "č"
-    }
-    deu_replacements = {
-        'sch': 'sh',
-        'ch': 'kh',
-        'ie': 'ee',
-        'ei': 'ai',
-        'ä': 'ae',
-        'ö': 'oe',
-        'ü': 'ue',
-        'ß': 'ss',
-        # Math symbols for German
-        'sqrt': ' Quadratwurzel aus ',
-        '^': ' hoch ',
-        '+': ' plus ',
-        ' - ': ' minus ',
-        '*': ' mal ',
-        ' / ': ' geteilt durch ',
-        '=': ' gleich ',
-        'pi': ' pi ',
-        '<': ' kleiner als ',
-        '>': ' größer als',
-        # Additional common math symbols from previous list
-        '%': ' prozent ',
-        '∑': ' Summe ',
-        '∫': ' Integral ',
-        '√': ' Quadratwurzel ',
-        '≠': ' ungleich ',
-        '≤': ' kleiner oder gleich ',
-        '≥': ' größer oder gleich ',
-        '≈': ' ungefähr ',
-        '∞': ' unendlich ',
-        '€': ' euro ',
-        '$': ' dollar ',
-        '£': ' pfund ',
-        '&': ' und ',
-        '@': ' at ', # 'Klammeraffe' is also common but 'at' is simpler
-        '#': ' raute ',
-    }
-    fra_replacements = {
-        # French specific phonetic replacements (add as needed)
-        # e.g., 'ç': 's', 'é': 'e', etc.
-        'w': 'v',
-        # Math symbols for French
-        'sqrt': ' racine carrée de ',
-        '^': ' à la puissance ',
-        '+': ' plus ',
-        ' - ': ' moins ',  # tiré ;
-        '*': ' fois ',
-        ' / ': ' divisé par ',
-        '=': ' égale ',
-        'pi': ' pi ',
-        '<': ' inférieur à ',
-        '>': ' supérieur à ',
-        # Add more common math symbols as needed for French
-        '%': ' pour cent ',
-        '∑': ' somme ',
-        '∫': ' intégrale ',
-        '√': ' racine carrée ',
-        '≠': ' n\'égale pas ',
-        '≤': ' inférieur ou égal à ',
-        '≥': ' supérieur ou égal à ',
-        '≈': ' approximativement ',
-        '∞': ' infini ',
-        '€': ' euro ',
-        '$': ' dollar ',
-        '£': ' livre ',
-        '&': ' et ',
-        '@': ' arobase ',
-        '#': ' dièse ',
-    }
-    hun_replacements = {
-        # Hungarian specific phonetic replacements (add as needed)
-        # e.g., 'á': 'a', 'é': 'e', etc.
-        'ch': 'ts',
-        'cs': 'tz',
-        'g': 'gk',
-        'w': 'v',
-        'z': 'zz',
-        # Math symbols for Hungarian
-        'sqrt': ' négyzetgyök ',
-        '^': ' hatvány ',
-        '+': ' plusz ',
-        ' - ': ' mínusz ',
-        '*': ' szorozva ',
-        ' / ': ' osztva ',
-        '=': ' egyenlő ',
-        'pi': ' pi ',
-        '<': ' kisebb mint ',
-        '>': ' nagyobb mint ',
-        # Add more common math symbols as needed for Hungarian
-        '%': ' százalék ',
-        '∑': ' szumma ',
-        '∫': ' integrál ',
-        '√': ' négyzetgyök ',
-        '≠': ' nem egyenlő ',
-        '≤': ' kisebb vagy egyenlő ',
-        '≥': ' nagyobb vagy egyenlő ',
-        '≈': ' körülbelül ',
-        '∞': ' végtelen ',
-        '€': ' euró ',
-        '$': ' dollár ',
-        '£': ' font ',
-        '&': ' és ',
-        '@': ' kukac ',
-        '#': ' kettőskereszt ',
-    }
-    grc_replacements = {
-        # Ancient Greek specific phonetic replacements (add as needed)
-        # These are more about transliterating Greek letters if they are in the input text.
-        # Math symbols for Ancient Greek (literal translations)
-        'sqrt': ' τετραγωνικὴ ῥίζα ',
-        '^': ' εἰς τὴν δύναμιν ',
-        '+': ' σὺν ',
-        ' - ': ' χωρὶς ',
-        '*': ' πολλάκις ',
-        ' / ': ' διαιρέω ',
-        '=': ' ἴσον ',
-        'pi': ' πῖ ',
-        '<': ' ἔλαττον ',
-        '>': ' μεῖζον ',
-        # Add more common math symbols as needed for Ancient Greek
-        '%': ' τοῖς ἑκατόν ', # tois hekaton - 'of the hundred'
-        '∑': ' ἄθροισμα ',
-        '∫': ' ὁλοκλήρωμα ',
-        '√': ' τετραγωνικὴ ῥίζα ',
-        '≠': ' οὐκ ἴσον ',
-        '≤': ' ἔλαττον ἢ ἴσον ',
-        '≥': ' μεῖζον ἢ ἴσον ',
-        '≈': ' περίπου ',
-        '∞': ' ἄπειρον ',
-        '€': ' εὐρώ ',
-        '$': ' δολάριον ',
-        '£': ' λίρα ',
-        '&': ' καὶ ',
-        '@': ' ἀτ ', # at
-        '#': ' δίεση ', # hash
-    }
-    # Select the appropriate replacement dictionary based on the language
-    replacements_map = {
-        'grc': grc_replacements,
-        'ron': ron_replacements,
-        'eng': eng_replacements,
-        'deu': deu_replacements,
-        'fra': fra_replacements,
-        'hun': hun_replacements,
-        'rmc-script_latin': serbian_replacements,
-    }
-    current_replacements = replacements_map.get(lang)
-    if current_replacements:
-        # Sort replacements by length of the key in descending order.
-        # This is crucial for correctly replacing multi-character strings (like 'sqrt', 'sch')
-        # before their shorter substrings ('s', 'ch', 'q', 'r', 't').
-        sorted_replacements = sorted(current_replacements.items(), key=lambda item: len(item[0]), reverse=True)
-        for old, new in sorted_replacements:
-            text = text.replace(old, new)
-        return text
-    else:
-        # If the language is not supported, return the original text
-        print(f"Warning: Language '{lang}' not supported for text replacement. Returning original text.")
-        return text
-def _num2words(text='01234', lang=None):
-    if lang == 'grc':
-        return convert_numbers(text)
-    return num2words(text, lang=lang)  # HAS TO BE kwarg lang=lang
-def transliterate_number(number_string,
-                         lang=None):
-    if lang == 'rmc-script_latin':
-        lang = 'sr'
-        exponential_pronoun = ' puta deset na stepen od '
-        comma = ' tačka '
-    elif lang == 'ron':
-        lang = 'ro'
-        exponential_pronoun = ' tízszer a erejéig '
-        comma = ' virgulă '
-    elif lang == 'hun':
-        lang = 'hu'
-        exponential_pronoun = ' tízszer a erejéig '
-        comma = ' virgula '
-    elif lang == 'deu':
-        exponential_pronoun = ' mal zehn hoch '
-        comma = ' komma '
-    elif lang == 'fra':
-        lang = 'fr'
-        exponential_pronoun = ' puissance '
-        comma = 'virgule'
-    elif lang == 'grc':
-        exponential_pronoun = ' εις την δυναμην του '
-        comma = 'κομμα'
-    else:
-        lang = lang[:2]
-        exponential_pronoun = ' times ten to the power of '
-        comma = ' point '
-    def replace_number(match):
-        prefix = match.group(1) or ""
-        number_part = match.group(2)
-        suffix = match.group(5) or ""
-        try:
-            if 'e' in number_part.lower():
-                base, exponent = number_part.lower().split('e')
-                words = _num2words(base, lang=lang) + exponential_pronoun + _num2words(exponent, lang=lang)
-            elif '.' in number_part:
-                integer_part, decimal_part = number_part.split('.')
-                words = _num2words(integer_part, lang=lang) + comma + " ".join(
-                    [_num2words(digit, lang=lang) for digit in decimal_part])
-            else:
-                words = _num2words(number_part, lang=lang)
-            return prefix + words + suffix
-        except ValueError:
-            return match.group(0)  # Return original if conversion fails
-    pattern = r'([^\d]*)(\d+(\.\d+)?([Ee][+-]?\d+)?)([^\d]*)'
-    return re.sub(pattern, replace_number, number_string)