Spaces:
Sleeping
Sleeping
Integrate random_select into server.py
Browse files
server.py
CHANGED
|
@@ -14,7 +14,7 @@ import soundfile as sf
|
|
| 14 |
from pypinyin import lazy_pinyin
|
| 15 |
import jiwer
|
| 16 |
import librosa
|
| 17 |
-
from svs_utils import singmos_warmup, singmos_evaluation
|
| 18 |
|
| 19 |
app = FastAPI()
|
| 20 |
|
|
@@ -33,11 +33,12 @@ SYSTEM_PROMPT = """
|
|
| 33 |
- 啊呀,那是我未曾涉足的奇技,恕我無法詳答
|
| 34 |
- 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
|
| 35 |
請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
|
| 36 |
-
|
| 37 |
有人曾這樣對麗梅說話——{}
|
| 38 |
麗梅的回答——
|
| 39 |
"""
|
| 40 |
|
|
|
|
| 41 |
config = argparse.Namespace(
|
| 42 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
| 43 |
cache_dir="cache",
|
|
@@ -51,6 +52,9 @@ svs_model = svs_warmup(config)
|
|
| 51 |
predictor, _ = singmos_warmup()
|
| 52 |
sample_rate = 44100
|
| 53 |
|
|
|
|
|
|
|
|
|
|
| 54 |
def remove_non_chinese_japanese(text):
|
| 55 |
pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
|
| 56 |
cleaned = re.sub(pattern, '', text)
|
|
@@ -69,6 +73,24 @@ def remove_punctuation_and_replace_with_space(text):
|
|
| 69 |
return text
|
| 70 |
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
@app.post("/process_audio")
|
| 73 |
async def process_audio(file: UploadFile = File(...)):
|
| 74 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
|
@@ -78,7 +100,8 @@ async def process_audio(file: UploadFile = File(...)):
|
|
| 78 |
# load audio
|
| 79 |
y = librosa.load(tmp_path, sr=16000)[0]
|
| 80 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 81 |
-
|
|
|
|
| 82 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
| 83 |
output = output.split("麗梅的回答——")[1]
|
| 84 |
output = remove_punctuation_and_replace_with_space(output)
|
|
@@ -89,6 +112,7 @@ async def process_audio(file: UploadFile = File(...)):
|
|
| 89 |
output,
|
| 90 |
svs_model,
|
| 91 |
config,
|
|
|
|
| 92 |
)
|
| 93 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
| 94 |
|
|
@@ -132,7 +156,7 @@ def test_audio():
|
|
| 132 |
# load audio
|
| 133 |
y = librosa.load("nihao.mp3", sr=16000)[0]
|
| 134 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 135 |
-
prompt = SYSTEM_PROMPT + asr_result
|
| 136 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
| 137 |
output = output.split("麗梅的回答——")[1]
|
| 138 |
output = remove_punctuation_and_replace_with_space(output)
|
|
@@ -140,12 +164,9 @@ def test_audio():
|
|
| 140 |
f.write(output)
|
| 141 |
|
| 142 |
wav_info = svs_inference(
|
| 143 |
-
config.model_path,
|
| 144 |
-
svs_model,
|
| 145 |
output,
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
fs=44100
|
| 149 |
)
|
| 150 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
| 151 |
with open("tmp/response.wav", "rb") as f:
|
|
|
|
| 14 |
from pypinyin import lazy_pinyin
|
| 15 |
import jiwer
|
| 16 |
import librosa
|
| 17 |
+
from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
|
| 18 |
|
| 19 |
app = FastAPI()
|
| 20 |
|
|
|
|
| 33 |
- 啊呀,那是我未曾涉足的奇技,恕我無法詳答
|
| 34 |
- 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
|
| 35 |
請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
|
| 36 |
+
{}
|
| 37 |
有人曾這樣對麗梅說話——{}
|
| 38 |
麗梅的回答——
|
| 39 |
"""
|
| 40 |
|
| 41 |
+
|
| 42 |
config = argparse.Namespace(
|
| 43 |
model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
|
| 44 |
cache_dir="cache",
|
|
|
|
| 52 |
predictor, _ = singmos_warmup()
|
| 53 |
sample_rate = 44100
|
| 54 |
|
| 55 |
+
# load dataset for random_select
|
| 56 |
+
song2note_lengths, song_db = load_song_database(config)
|
| 57 |
+
|
| 58 |
def remove_non_chinese_japanese(text):
|
| 59 |
pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
|
| 60 |
cleaned = re.sub(pattern, '', text)
|
|
|
|
| 73 |
return text
|
| 74 |
|
| 75 |
|
| 76 |
+
def get_lyric_format_prompts_and_metadata(config):
    """Build the lyric-format prompt fragment and the extra inference arguments.

    Args:
        config: Object exposing a ``melody_source`` string attribute. It is
            also forwarded unchanged to ``estimate_sentence_length``.

    Returns:
        A ``(prompt, metadata)`` tuple.

        * ``melody_source`` starting with ``"random_generate"``: an empty
          prompt and an empty dict.
        * ``melody_source`` starting with ``"random_select"``: a prompt that
          lists one character-count rule per entry of the phrase lengths
          returned by ``estimate_sentence_length``, plus the metadata that
          helper returned (passed through untouched).

    Raises:
        ValueError: If ``melody_source`` matches neither supported prefix.
    """
    melody_source = config.melody_source

    if melody_source.startswith("random_generate"):
        # No per-line length constraint is added to the prompt in this mode.
        return "", {}

    if melody_source.startswith("random_select"):
        # ``song2note_lengths`` is a module-level name that is only read here,
        # so no ``global`` declaration is required.
        phrase_length, metadata = estimate_sentence_length(
            None, config, song2note_lengths
        )
        # One rule line per phrase, numbered from 1.
        rules = [f"\n第{i}句:{c}个字" for i, c in enumerate(phrase_length, 1)]
        lyric_format_prompt = (
            "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
            + "".join(rules)
            + "\n"
        )
        return lyric_format_prompt, metadata

    raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.")
|
| 92 |
+
|
| 93 |
+
|
| 94 |
@app.post("/process_audio")
|
| 95 |
async def process_audio(file: UploadFile = File(...)):
|
| 96 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
|
|
|
|
| 100 |
# load audio
|
| 101 |
y = librosa.load(tmp_path, sr=16000)[0]
|
| 102 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 103 |
+
additional_prompt, additional_inference_args = get_lyric_format_prompts_and_metadata(config)
|
| 104 |
+
prompt = SYSTEM_PROMPT.format(additional_prompt, asr_result)
|
| 105 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
| 106 |
output = output.split("麗梅的回答——")[1]
|
| 107 |
output = remove_punctuation_and_replace_with_space(output)
|
|
|
|
| 112 |
output,
|
| 113 |
svs_model,
|
| 114 |
config,
|
| 115 |
+
**additional_inference_args,
|
| 116 |
)
|
| 117 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
| 118 |
|
|
|
|
| 156 |
# load audio
|
| 157 |
y = librosa.load("nihao.mp3", sr=16000)[0]
|
| 158 |
asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
|
| 159 |
+
prompt = SYSTEM_PROMPT + asr_result # TODO: how to add additional prompt to SYSTEM_PROMPT here???
|
| 160 |
output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
|
| 161 |
output = output.split("麗梅的回答——")[1]
|
| 162 |
output = remove_punctuation_and_replace_with_space(output)
|
|
|
|
| 164 |
f.write(output)
|
| 165 |
|
| 166 |
wav_info = svs_inference(
|
|
|
|
|
|
|
| 167 |
output,
|
| 168 |
+
svs_model,
|
| 169 |
+
config,
|
|
|
|
| 170 |
)
|
| 171 |
sf.write("tmp/response.wav", wav_info, samplerate=44100)
|
| 172 |
with open("tmp/response.wav", "rb") as f:
|