Somalitts committed on
Commit
8c275c0
·
verified ·
1 Parent(s): 4700dbc

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +139 -0
app.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ import soundfile as sf
4
+ import spaces
5
+ import os
6
+ import numpy as np
7
+ import re
8
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
+ from speechbrain.pretrained import EncoderClassifier
10
+ from datasets import load_dataset
11
+
12
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
13
+
14
def load_models_and_data():
    """Load every pretrained component the app needs, plus one reference sample.

    Returns:
        A 5-tuple ``(model, processor, vocoder, speaker_model, example)``:
        the SpeechT5 text-to-speech model, its processor, the HiFi-GAN
        vocoder, the SpeechBrain x-vector speaker encoder, and the dataset
        row at index 304.
    """
    # NOTE(review): the processor comes from the base checkpoint while the
    # weights come from a fine-tuned checkpoint whose name suggests Turkish
    # data, although the UI is labelled Somali - confirm this is intended.
    tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(
        "emirhanbilgic/speecht5_finetuned_emirhan_tr"
    )
    tts_model = tts_model.to(device)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    hifigan = hifigan.to(device)

    xvector_source = "speechbrain/spkrec-xvect-voxceleb"
    xvector_encoder = EncoderClassifier.from_hparams(
        source=xvector_source,
        run_opts={"device": device},
        savedir=os.path.join("/tmp", xvector_source),
    )

    # Only a single row of the train split is actually used.
    train_split = load_dataset("erenfazlioglu/turkishvoicedataset", split="train")
    reference_row = train_split[304]

    return tts_model, tts_processor, hifigan, xvector_encoder, reference_row


# Everything is loaded once, at import time, and shared by all requests.
model, processor, vocoder, speaker_model, default_example = load_models_and_data()
34
+
35
def create_speaker_embedding(waveform):
    """Encode a waveform into a normalised speaker embedding.

    Args:
        waveform: Audio samples in any form ``torch.tensor`` accepts
            (presumably a 1-D mono array - TODO confirm against the dataset).

    Returns:
        The speaker encoder's output, L2-normalised along dim 2 and with
        all singleton dimensions squeezed out.
    """
    with torch.no_grad():
        # A leading batch axis is added before the samples go to the encoder.
        batch = torch.tensor(waveform).unsqueeze(0).to(device)
        embedding = torch.nn.functional.normalize(
            speaker_model.encode_batch(batch), dim=2
        )
        return embedding.squeeze()
41
+
42
def prepare_default_embedding(example):
    """Return the speaker embedding for one dataset row.

    Args:
        example: Mapping with an ``"audio"`` entry that holds the raw
            samples under its ``"array"`` key.
    """
    return create_speaker_embedding(example["audio"]["array"])


# Computed once at import time from the reference dataset row.
default_embedding = prepare_default_embedding(default_example)
47
+
48
+
49
# Somali words for the values that number_to_words cannot compose from
# smaller parts: 0-19, the round tens, one hundred and one thousand.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(number):
    """Spell out a non-negative integer using the words in ``number_words``.

    Args:
        number: Integer to convert. Negative values are not supported and
            raise ``KeyError`` from the table lookup.

    Returns:
        The lower-case, space-separated spelling for values below 10**12;
        larger values are returned unchanged as a decimal string.
    """
    if number < 20:
        return number_words[number]

    if number < 100:
        tens, unit = divmod(number, 10)
        words = number_words[tens * 10]
        return f"{words} {number_words[unit]}" if unit else words

    if number < 1000:
        hundreds, remainder = divmod(number, 100)
        # Exactly one hundred is said without a leading "one". The word is
        # read from the table so its casing matches every other word.
        if hundreds == 1:
            head = number_words[100]
        else:
            head = f"{number_words[hundreds]} {number_words[100]}"
    elif number < 1000000:
        thousands, remainder = divmod(number, 1000)
        # Same rule as for hundreds: a single thousand has no multiplier.
        if thousands == 1:
            head = number_words[1000]
        else:
            head = f"{number_to_words(thousands)} {number_words[1000]}"
    elif number < 1000000000:
        millions, remainder = divmod(number, 1000000)
        head = number_to_words(millions) + " malyan"
    elif number < 1000000000000:
        billions, remainder = divmod(number, 1000000000)
        head = number_to_words(billions) + " milyaar"
    else:
        # Beyond the supported range: leave the digits as they are.
        return str(number)

    return f"{head} {number_to_words(remainder)}" if remainder else head
80
+
81
def replace_numbers_with_words(text):
    """Spell out every whole-word run of digits found in *text*.

    Each match is parsed with ``int`` and replaced by the result of
    ``number_to_words``; all other characters are left as they are.
    """
    return re.sub(r'\b\d+\b', lambda found: number_to_words(int(found.group())), text)
91
+
92
# (old, new) substring substitutions applied by normalize_text after number
# expansion. The name must exist for normalize_text to run at all; leave the
# list empty when no substitutions are needed.
replacements = []


def normalize_text(text):
    """Prepare raw user text for the TTS processor.

    Steps, in order: lower-case the text, spell out whole-word digit runs,
    apply each ``(old, new)`` pair from ``replacements``, then delete every
    character that is neither a word character nor whitespace.

    Args:
        text: The raw input string.

    Returns:
        The normalised string.
    """
    # Lower-casing happens before number expansion.
    text = replace_numbers_with_words(text.lower())

    for old, new in replacements:
        text = text.replace(old, new)

    # Strip punctuation last so substitutions above can still match it.
    return re.sub(r'[^\w\s]', '', text)
107
+
108
@spaces.GPU(duration=60)
def text_to_speech(text, audio_file=None):
    """Synthesise speech for *text* with the default speaker embedding.

    Args:
        text: Raw user text; it is passed through ``normalize_text`` first.
        audio_file: Accepted for interface compatibility but not used.

    Returns:
        A ``(16000, samples)`` tuple: the sample rate and the generated
        waveform as a NumPy array.

    Raises:
        gr.Error: If no text is left to speak after normalisation (empty,
            whitespace-only or punctuation-only input).
    """
    # Treat a missing value like an empty string so the guard below handles it.
    normalized_text = normalize_text(text or "")
    if not normalized_text.strip():
        # Surface a readable message in the UI instead of running the model
        # on an empty prompt.
        raise gr.Error("Please enter some text to synthesise.")

    inputs = processor(text=normalized_text, return_tensors="pt").to(device)

    with torch.no_grad():
        speech = model.generate_speech(
            inputs["input_ids"],
            default_embedding.unsqueeze(0),
            vocoder=vocoder,
        )

    # NOTE(review): 16000 is presumably the vocoder's output sample rate -
    # confirm against the vocoder configuration.
    return (16000, speech.cpu().numpy())
126
+
127
# One text box in, one audio player out.
_text_input = gr.Textbox(label="soo gali somali language")
_audio_output = gr.Audio(label="Generated Speech", type="numpy")

iface = gr.Interface(
    fn=text_to_speech,
    inputs=[_text_input],
    outputs=[_audio_output],
    title="soomaali",
    description="soomaal lnaguage.",
)

# Start the web server as soon as the module is executed.
iface.launch(share=True)