ms180 committed on
Commit
4e0d427
·
unverified ·
2 Parent(s): 67367de 5742d27

Merge pull request #1 from HANJionghao/main

Browse files
Files changed (1) hide show
  1. server.py +32 -9
server.py CHANGED
@@ -14,7 +14,7 @@ import soundfile as sf
14
  from pypinyin import lazy_pinyin
15
  import jiwer
16
  import librosa
17
- from svs_utils import singmos_warmup, singmos_evaluation
18
 
19
  app = FastAPI()
20
 
@@ -33,11 +33,12 @@ SYSTEM_PROMPT = """
33
  - 啊呀,那是我未曾涉足的奇技,恕我無法詳答
34
  - 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
35
  請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
36
-
37
  有人曾這樣對麗梅說話——{}
38
  麗梅的回答——
39
  """
40
 
 
41
  config = argparse.Namespace(
42
  model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
43
  cache_dir="cache",
@@ -51,6 +52,10 @@ svs_model = svs_warmup(config)
51
  predictor, _ = singmos_warmup()
52
  sample_rate = 44100
53
 
 
 
 
 
54
  def remove_non_chinese_japanese(text):
55
  pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
56
  cleaned = re.sub(pattern, '', text)
@@ -69,6 +74,25 @@ def remove_punctuation_and_replace_with_space(text):
69
  return text
70
 
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  @app.post("/process_audio")
73
  async def process_audio(file: UploadFile = File(...)):
74
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
@@ -78,7 +102,8 @@ async def process_audio(file: UploadFile = File(...)):
78
  # load audio
79
  y = librosa.load(tmp_path, sr=16000)[0]
80
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
81
- prompt = SYSTEM_PROMPT.format(asr_result)
 
82
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
83
  output = output.split("麗梅的回答——")[1]
84
  output = remove_punctuation_and_replace_with_space(output)
@@ -89,6 +114,7 @@ async def process_audio(file: UploadFile = File(...)):
89
  output,
90
  svs_model,
91
  config,
 
92
  )
93
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
94
 
@@ -132,7 +158,7 @@ def test_audio():
132
  # load audio
133
  y = librosa.load("nihao.mp3", sr=16000)[0]
134
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
135
- prompt = SYSTEM_PROMPT + asr_result
136
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
137
  output = output.split("麗梅的回答——")[1]
138
  output = remove_punctuation_and_replace_with_space(output)
@@ -140,12 +166,9 @@ def test_audio():
140
  f.write(output)
141
 
142
  wav_info = svs_inference(
143
- config.model_path,
144
- svs_model,
145
  output,
146
- lang=config.lang,
147
- random_gen=True,
148
- fs=44100
149
  )
150
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
151
  with open("tmp/response.wav", "rb") as f:
 
14
  from pypinyin import lazy_pinyin
15
  import jiwer
16
  import librosa
17
+ from svs_utils import singmos_warmup, singmos_evaluation, load_song_database, estimate_sentence_length
18
 
19
  app = FastAPI()
20
 
 
33
  - 啊呀,那是我未曾涉足的奇技,恕我無法詳答
34
  - 此乃異邦技藝,與樂音無涉,麗梅便不敢妄言了
35
  請始終維持你作為麗梅的優雅語氣與詩意風格,並以真摯的心回應對方的言語,言語宜簡,勿過長。
36
+ {}
37
  有人曾這樣對麗梅說話——{}
38
  麗梅的回答——
39
  """
40
 
41
+
42
  config = argparse.Namespace(
43
  model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
44
  cache_dir="cache",
 
52
  predictor, _ = singmos_warmup()
53
  sample_rate = 44100
54
 
55
+ # load dataset for random_select
56
+ song2note_lengths, song_db = load_song_database(config)
57
+
58
+
59
  def remove_non_chinese_japanese(text):
60
  pattern = r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
61
  cleaned = re.sub(pattern, '', text)
 
74
  return text
75
 
76
 
77
def get_lyric_format_prompts_and_metadata(config):
    """Return the lyric-format prompt and extra metadata for ``config.melody_source``.

    Args:
        config: Object exposing a string attribute ``melody_source``. It is
            also forwarded unchanged to ``estimate_sentence_length``.

    Returns:
        A ``(lyric_format_prompt, metadata)`` tuple.

        * ``melody_source`` starting with ``"random_generate"``: ``("", {})``,
          i.e. no formatting constraint is added to the prompt.
        * ``melody_source`` starting with ``"random_select"``: a prompt that
          lists the required character count of every lyric line, plus the
          metadata object returned by ``estimate_sentence_length``.

    Raises:
        ValueError: If ``melody_source`` starts with neither prefix.
    """
    melody_source = config.melody_source

    if melody_source.startswith("random_generate"):
        return "", {}

    if melody_source.startswith("random_select"):
        # ``song2note_lengths`` is a module-level name that is only read here,
        # so no ``global`` declaration is needed.
        phrase_length, metadata = estimate_sentence_length(
            None, config, song2note_lengths
        )
        # One rule per phrase, numbered from 1. The previous code applied a
        # unary ``+`` to the list handed to ``str.join``, which raises
        # ``TypeError``; the pieces are now joined and concatenated explicitly.
        per_line_rules = "".join(
            f"\n第{i}句:{c}个字" for i, c in enumerate(phrase_length, 1)
        )
        lyric_format_prompt = (
            "\n请按照歌词格式回答我的问题,每句需遵循以下字数规则:"
            + per_line_rules
            + "\n如果没有足够的信息回答,请使用最少的句子,不要重复、不要扩展、不要加入无关内容。\n"
        )
        return lyric_format_prompt, metadata

    raise ValueError(f"Unsupported melody_source: {config.melody_source}. Unable to get lyric format prompts.")
94
+
95
+
96
  @app.post("/process_audio")
97
  async def process_audio(file: UploadFile = File(...)):
98
  with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
 
102
  # load audio
103
  y = librosa.load(tmp_path, sr=16000)[0]
104
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
105
+ additional_prompt, additional_inference_args = get_lyric_format_prompts_and_metadata(config)
106
+ prompt = SYSTEM_PROMPT.format(additional_prompt, asr_result)
107
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
108
  output = output.split("麗梅的回答——")[1]
109
  output = remove_punctuation_and_replace_with_space(output)
 
114
  output,
115
  svs_model,
116
  config,
117
+ **additional_inference_args,
118
  )
119
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
120
 
 
158
  # load audio
159
  y = librosa.load("nihao.mp3", sr=16000)[0]
160
  asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"} )['text']
161
+ prompt = SYSTEM_PROMPT + asr_result # TODO: how to add additional prompt to SYSTEM_PROMPT here???
162
  output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
163
  output = output.split("麗梅的回答——")[1]
164
  output = remove_punctuation_and_replace_with_space(output)
 
166
  f.write(output)
167
 
168
  wav_info = svs_inference(
 
 
169
  output,
170
+ svs_model,
171
+ config,
 
172
  )
173
  sf.write("tmp/response.wav", wav_info, samplerate=44100)
174
  with open("tmp/response.wav", "rb") as f: