Spaces:
Running
Running
expo
Browse files- app.py +5 -8
- audiocraft.py +11 -5
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -38,7 +38,7 @@ language_names = ['Ancient greek',
|
|
| 38 |
|
| 39 |
def audionar_tts(text=None,
|
| 40 |
lang='Romanian',
|
| 41 |
-
soundscape='',
|
| 42 |
cache_lim=24):
|
| 43 |
|
| 44 |
# https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
|
|
@@ -62,9 +62,7 @@ def audionar_tts(text=None,
|
|
| 62 |
|
| 63 |
x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
if lang not in language_names: # StyleTTS2
|
| 68 |
|
| 69 |
text = only_greek_or_only_latin(text, lang='eng')
|
| 70 |
|
|
@@ -77,7 +75,7 @@ def audionar_tts(text=None,
|
|
| 77 |
original_rate=24000,
|
| 78 |
target_rate=16000)[0, :] # 16 KHz
|
| 79 |
|
| 80 |
-
|
| 81 |
|
| 82 |
lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
|
| 83 |
|
|
@@ -254,7 +252,6 @@ VOICES = [
|
|
| 254 |
|
| 255 |
_tts = StyleTTS2().to('cpu')
|
| 256 |
|
| 257 |
-
|
| 258 |
with gr.Blocks(theme='huggingface') as demo:
|
| 259 |
with gr.Row():
|
| 260 |
text_input = gr.Textbox(
|
|
@@ -264,9 +261,9 @@ with gr.Blocks(theme='huggingface') as demo:
|
|
| 264 |
value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
|
| 265 |
)
|
| 266 |
choice_dropdown = gr.Dropdown(
|
| 267 |
-
choices=
|
| 268 |
label="Vox",
|
| 269 |
-
value=
|
| 270 |
)
|
| 271 |
soundscape_input = gr.Textbox(
|
| 272 |
lines=1,
|
|
|
|
| 38 |
|
| 39 |
def audionar_tts(text=None,
|
| 40 |
lang='Romanian',
|
| 41 |
+
soundscape='frogs',
|
| 42 |
cache_lim=24):
|
| 43 |
|
| 44 |
# https://huggingface.co/dkounadis/artificial-styletts2/blob/main/msinference.py
|
|
|
|
| 62 |
|
| 63 |
x = np.zeros(4 * 16000, dtype=np.float32) # If no txt 4s of audiogen
|
| 64 |
|
| 65 |
+
elif lang not in language_names: # text exists / StyleTTS2
|
|
|
|
|
|
|
| 66 |
|
| 67 |
text = only_greek_or_only_latin(text, lang='eng')
|
| 68 |
|
|
|
|
| 75 |
original_rate=24000,
|
| 76 |
target_rate=16000)[0, :] # 16 KHz
|
| 77 |
|
| 78 |
+
else: # VITS
|
| 79 |
|
| 80 |
lang_code = lang_map.get(lang.lower(), lang.lower().split()[0].strip())
|
| 81 |
|
|
|
|
| 252 |
|
| 253 |
_tts = StyleTTS2().to('cpu')
|
| 254 |
|
|
|
|
| 255 |
with gr.Blocks(theme='huggingface') as demo:
|
| 256 |
with gr.Row():
|
| 257 |
text_input = gr.Textbox(
|
|
|
|
| 261 |
value='Η γρηγορη καφετι αλεπου πειδαει πανω απο τον τεμπελη σκυλο.',
|
| 262 |
)
|
| 263 |
choice_dropdown = gr.Dropdown(
|
| 264 |
+
choices=VOICES + language_names,
|
| 265 |
label="Vox",
|
| 266 |
+
value=VOICES[0]
|
| 267 |
)
|
| 268 |
soundscape_input = gr.Textbox(
|
| 269 |
lines=1,
|
audiocraft.py
CHANGED
|
@@ -459,10 +459,16 @@ class LMModel(nn.Module):
|
|
| 459 |
logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1, 2048]
|
| 460 |
logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :] # [ bs, 4, n_draw, 2048]
|
| 461 |
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 466 |
|
| 467 |
@torch.no_grad()
|
| 468 |
def generate(self,
|
|
@@ -718,7 +724,7 @@ class StreamingTransformer(nn.Module):
|
|
| 718 |
|
| 719 |
if __name__ == '__main__':
|
| 720 |
|
| 721 |
-
import audiofile
|
| 722 |
model = AudioGen().to('cpu')
|
| 723 |
x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
|
| 724 |
audiofile.write('_sound_.wav', x, 16000)
|
|
|
|
| 459 |
logits = torch.stack([self.linears[k](out) for k in range(n_q)], dim=1) # [2*bs, 4, 1, 2048]
|
| 460 |
logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :] # [ bs, 4, n_draw, 2048]
|
| 461 |
|
| 462 |
+
k = 24
|
| 463 |
+
logits = torch.softmax(logits / 1.0, dim=3) # [bs, 4, 1, 2048]
|
| 464 |
+
p, ix = torch.topk(logits, k, dim=3) # p = [bs, 4, 1, 24], ix = [bs, 4, 1, 2048]
|
| 465 |
+
# Exponential Distribution
|
| 466 |
+
deflation = torch.empty_like(p).exponential_(lambd=1)
|
| 467 |
+
p = p / deflation
|
| 468 |
+
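# Divide each top-k prob by an independent Exp(1) draw, then take the argmax: index i wins with probability p_i / sum(p), i.e. a sample from the top-k distribution. Exp(1) draws lie in (0, Inf), but a draw small enough to lift a tiny prob (e.g. .001) above the large ones is unlikely.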
|
| 469 |
+
p = p.argmax(dim=3, keepdim=True) # [bs, 4, n_draw, 24]
|
| 470 |
+
tok = ix.gather(dim=3, index=p).to(torch.int64) # [bs, 4, n_draw, 1]
|
| 471 |
+
return tok[:, :, :, 0].transpose(1, 2) # [bs, n_draw, 4]
|
| 472 |
|
| 473 |
@torch.no_grad()
|
| 474 |
def generate(self,
|
|
|
|
| 724 |
|
| 725 |
if __name__ == '__main__':
|
| 726 |
|
| 727 |
+
import audiofile # pip uninstall flash-attn
|
| 728 |
model = AudioGen().to('cpu')
|
| 729 |
x = model.generate(prompt='swims in lake frogs', duration=6.4).cpu().numpy()
|
| 730 |
audiofile.write('_sound_.wav', x, 16000)
|
requirements.txt
CHANGED
|
@@ -9,7 +9,7 @@ numpy<2.0.0
|
|
| 9 |
gradio==5.27.0
|
| 10 |
Numbers2Words-Greek
|
| 11 |
einops
|
| 12 |
-
torch
|
| 13 |
pydantic==2.10.6
|
| 14 |
transformers==4.49.0
|
| 15 |
sentencepiece
|
|
|
|
| 9 |
gradio==5.27.0
|
| 10 |
Numbers2Words-Greek
|
| 11 |
einops
|
| 12 |
+
torch==2.1.0
|
| 13 |
pydantic==2.10.6
|
| 14 |
transformers==4.49.0
|
| 15 |
sentencepiece
|