trial live demo
Browse files- live_api.py +172 -0
- live_demo.py +173 -0
live_api.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
import numpy as np
|
| 4 |
+
import soundfile
|
| 5 |
+
import audresample
|
| 6 |
+
import text_utils
|
| 7 |
+
import msinference
|
| 8 |
+
import re
|
| 9 |
+
import srt
|
| 10 |
+
import subprocess
|
| 11 |
+
import markdown
|
| 12 |
+
import json
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from types import SimpleNamespace
|
| 15 |
+
from flask import Flask, request, send_from_directory
|
| 16 |
+
from flask_cors import CORS
|
| 17 |
+
from audiocraft.audiogen import AudioGen, audio_write
|
| 18 |
+
|
| 19 |
+
# Load the AudioGen background-sound model once at import time (heavyweight
# download/initialization) and fix generated clips to 6 seconds.
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=6)

# Directory where synthesized WAVs are written and served from.
CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _shift(x):
|
| 27 |
+
n = x.shape[0]
|
| 28 |
+
i = np.random.randint(.24 * n, max(1, .74 * n)) # high should be above >= 0
|
| 29 |
+
x = np.roll(x, i)
|
| 30 |
+
# fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
|
| 31 |
+
# x = x * fade_in
|
| 32 |
+
return x
|
| 33 |
+
|
| 34 |
+
def overlay(x, sound_background=None):
    """Mix speech ``x`` (1-D np.array) with an optional generated background.

    sound_background : torch tensor of shape (1, samples) or None.
        Returns ``x`` unchanged when None — the mixing (and the debug print
        that dereferences sound_background.shape) must only run when a
        background actually exists, otherwise this crashes on None.
    """
    if sound_background is not None:
        sound_background = sound_background.detach().cpu().numpy()[0, :]
        len_speech = len(x)
        # Tile the background (with random circular shifts so the repeats
        # are not audibly identical) until it covers the whole speech.
        if len_speech > len(sound_background):
            n_repeat = len_speech // len(sound_background) + 1
            replica = [sound_background] * n_repeat
            replica = [_shift(_) for _ in replica]
            sound_background = np.concatenate(replica)
        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
        # Fixed mixing ratio: 74% speech / 26% background.
        x = .74 * x + .26 * sound_background[:len_speech]
    return x
|
| 48 |
+
|
| 49 |
+
def tts_multi_sentence(precomputed_style_vector=None,
                       text=None,
                       voice=None,
                       scene=None):
    '''create 24kHZ np.array with tts

       precomputed_style_vector : required if en_US or en_UK in voice, so
                                  to perform affective TTS.
       text : list of single sentences (strings)
       voice : string or None (falls to styleTTS)
       scene : 'A castle in far away lands' -> if passed will generate background sound scene
    '''
    # Generate sound scene (AudioGen) - up sample to 24KHz
    if scene is not None:
        sound_background = sound_generator.generate([scene])[0]
        sound_background = audio_write(None,
                                       sound_background.cpu(),
                                       24000,  # sound_generator.sample_rate,
                                       strategy="loudness",
                                       loudness_compressor=True)
    else:
        sound_background = None

    # StyleTTS2 (affective) path.
    # BUGFIX: test `voice is None` FIRST — the original evaluated
    # `'en_US/' in voice` before the None check, raising
    # `TypeError: argument of type 'NoneType' is not iterable`
    # for the documented voice=None fallback.
    if (voice is None) or ('en_US/' in voice) or ('en_UK/' in voice):
        assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
        x = []
        for _sentence in text:
            x.append(msinference.inference(_sentence,
                                           precomputed_style_vector,
                                           alpha=0.3,
                                           beta=0.7,
                                           diffusion_steps=7,
                                           embedding_scale=1))
        x = np.concatenate(x)
        return overlay(x, sound_background)

    # Fallback - Mimic-3 (shell pipeline writes _tmp.wav, then resample to 24kHz)
    text_utils.store_ssml(text=text, voice=voice)  # Text has to be list of single sentences
    ps = subprocess.Popen('cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
    ps.wait()
    x, fs = soundfile.read('_tmp.wav')
    x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)

    return overlay(x, sound_background)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
# voices = {}
|
| 101 |
+
# import phonemizer
|
| 102 |
+
# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
|
| 103 |
+
|
| 104 |
+
# Flask application; CORS(app) enables cross-origin requests so a browser
# front-end on another host/port can call this API.
app = Flask(__name__)
cors = CORS(app)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
@app.route("/")
|
| 109 |
+
def index():
|
| 110 |
+
with open('README.md', 'r') as f:
|
| 111 |
+
return markdown.markdown(f.read())
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
@app.route("/", methods=['GET', 'POST', 'PUT'])
|
| 115 |
+
def serve_wav():
|
| 116 |
+
# https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
|
| 117 |
+
# object-into-a-representation-suitable-for-mongodb
|
| 118 |
+
r = request.form.to_dict(flat=False)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
args = SimpleNamespace(
|
| 122 |
+
text=None if r.get('text') is None else r.get('text'), # string not file?
|
| 123 |
+
voice=r.get('voice')[0],
|
| 124 |
+
native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/",""),
|
| 125 |
+
affective = r.get('affective')[0],
|
| 126 |
+
scene=r.get('scene')[0]
|
| 127 |
+
)
|
| 128 |
+
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
print(args, 'ENTER Script')
|
| 132 |
+
do_video_dub = False
|
| 133 |
+
|
| 134 |
+
# ====STYLE VECTOR====
|
| 135 |
+
|
| 136 |
+
precomputed_style_vector = None
|
| 137 |
+
# NOTE: style vector may be None
|
| 138 |
+
|
| 139 |
+
if precomputed_style_vector is None:
|
| 140 |
+
if 'en_US' in args.voice or 'en_UK' in args.voice:
|
| 141 |
+
_dir = '/' if args.affective else '_v2/'
|
| 142 |
+
precomputed_style_vector = msinference.compute_style(
|
| 143 |
+
'assets/wavs/style_vector' + _dir + args.voice.replace(
|
| 144 |
+
'/', '_').replace(
|
| 145 |
+
'#', '_').replace(
|
| 146 |
+
'cmu-arctic', 'cmu_arctic').replace(
|
| 147 |
+
'_low', '') + '.wav')
|
| 148 |
+
print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
x = tts_multi_sentence(text=args.text,
|
| 154 |
+
precomputed_style_vector=precomputed_style_vector,
|
| 155 |
+
voice=args.voice,
|
| 156 |
+
scene=args.scene)
|
| 157 |
+
OUT_FILE = 'tmp.wav'
|
| 158 |
+
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
# send server's output as default file -> srv_result.xx
|
| 165 |
+
print(f'\n=SERVER saved as {OUT_FILE=}\n')
|
| 166 |
+
response = send_from_directory(CACHE_DIR, path=OUT_FILE)
|
| 167 |
+
response.headers['suffix-file-type'] = OUT_FILE
|
| 168 |
+
return response
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
if __name__ == "__main__":
|
| 172 |
+
app.run(host="0.0.0.0")
|
live_demo.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import argparse
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
import subprocess
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
# SSH AGENT
|
| 9 |
+
# eval $(ssh-agent -s)
|
| 10 |
+
# ssh-add ~/.ssh/id_ed25519_github2024
|
| 11 |
+
#
|
| 12 |
+
# git remote set-url origin git@github.com:audeering/shift
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
|
| 17 |
+
# import multiprocessing
|
| 18 |
+
# from playsound import playsound
|
| 19 |
+
|
| 20 |
+
# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
|
| 21 |
+
# p.start()
|
| 22 |
+
# input("press ENTER to stop playback")
|
| 23 |
+
# p.terminate()
|
| 24 |
+
# from playsound import playsound
|
| 25 |
+
# playsound('/path/to/a/sound/file/you/want/to/play.mp3')
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def command_line_args():
    """Build and return the argument parser for the live demo client."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    # NOTE(review): store_false makes --affective default to True and the
    # flag *disable* it, which reads oddly next to the help text — confirm
    # this inversion is intended before changing it.
    parser.add_argument(
        '--affective',
        action='store_false',
        help="Select Emotional or non-emotional variant of Available voices: https://audeering.github.io/shift/",
    )
    parser.add_argument(
        '--device',
        type=str,
        default='cpu',
        help="Device ID",
    )
    parser.add_argument(
        '--text',
        type=str,
        default='sample.txt',
        help="Text to be synthesized.",
    )
    parser.add_argument(
        '--native',
        # nargs='?',
        # const=None,
        # default=False # default has to be none
        help="""
        --native: (without argument) a flag to do voice cloning using the speech from --video,
        --native my_voice.wav: Voice cloning from user provided audio""",
    )
    parser.add_argument(
        '--voice',
        type=str,
        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
        help="TTS voice - Available voices: https://audeering.github.io/shift/",
    )
    parser.add_argument(
        '--image',
        type=str,
        help="If provided is set as background for output video, see --text",
    )
    parser.add_argument(
        '--video',
        type=str,
        help="Video file for video translation. Voice cloned from the video",
    )
    parser.add_argument(
        '--out_file',
        type=str,
        default='b6',
        help="Output file name.",
    )
    parser.add_argument(
        '--scene',
        type=str,
        default='calm background sounds of a castle',
        help='Sound scene description.',
    )
    return parser
|
| 88 |
+
|
| 89 |
+
def send_to_server(args):
    """POST the demo request (form fields + optional attachments) to the
    TTS server and return the ``requests`` response object.

    Missing attachment files are tolerated (best-effort demo client).
    """
    url = "http://192.168.88.209:5000"

    payload = {
        'affective': args.affective,
        'voice': args.voice,
        'native': args.native,
        'text': args.text,
        'image': args.image,
        'video': args.video,
        'scene': args.scene,
        # 'out_file': args.out_file # let serve save as temp
    }

    # In data= we can write args
    # In files= sent actual files if provided.
    # BUGFIX: the original also did `open(args.text, 'rb')` here, which
    # leaked the handle and crashed with FileNotFoundError whenever cli()
    # put a typed sentence (not a path) into args.text; the handle was
    # never sent, so it is simply dropped.

    image_file, video_file, native_file = None, None, None
    if args.image is not None:
        print('\nLOADING IMAGE\n')
        try:
            image_file = open(args.image, 'rb')
        except FileNotFoundError:
            pass

    if args.video is not None:
        print('\nLOADING vid\n')
        try:
            video_file = open(args.video, 'rb')
        except FileNotFoundError:
            pass

    if args.native is not None:
        print('\nLOADING natv\n')
        try:
            native_file = open(args.native, 'rb')
        except FileNotFoundError:
            pass

    # --------------------- send this extra

    print('Sending...\n')

    try:
        response = requests.post(url, data=payload,
                                 files=[(args.image, image_file)])  # NONEs do not arrive to servers dict
    finally:
        # BUGFIX: close every opened attachment handle (the original leaked
        # all of them on every call of the interactive loop).
        for fh in (image_file, video_file, native_file):
            if fh is not None:
                fh.close()

    # Check the response from the server
    if response.status_code == 200:
        print("\nRequest was successful!")
    else:
        print("Failed to send the request")
        print("Status Code:", response.status_code)
        print("Response:", response.text)
    return response
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def cli():  # args.out_file is not send to server - server writes tmp - copied by client
    """Interactive loop: read a line of text, synthesize it via the server,
    save the returned audio locally, and play it with paplay."""
    args = command_line_args().parse_args()
    while True:
        args.text = input("Type your text: ")
        response = send_to_server(args)
        # Pick the local extension from the server-supplied header.
        suffix = response.headers['suffix-file-type'].split('.')[-1]
        out_file = f'{args.out_file}.{suffix}'
        with open(out_file, 'wb') as sink:
            sink.write(response.content)
        print('REsponse AT client []\n----------------------------', response.headers)
        subprocess.run(["paplay", out_file])
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
if __name__ == '__main__':
    # Run the interactive client loop (Ctrl-C to exit).
    cli()
|
| 171 |
+
|
| 172 |
+
# assume also video and text for video we have to write some classes for video for audiocraft
|
| 173 |
+
# then call tts.py on this video with nonempty labels - thus calls audiocraft
|