Spaces:
Sleeping
Sleeping
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import FileResponse, JSONResponse

import argparse
import base64
import os
import re
import tempfile
import time

import jiwer
import librosa
import soundfile as sf
import torch
from pypinyin import lazy_pinyin
from transformers import pipeline

from svs_utils import singmos_warmup, singmos_evaluation, svs_inference, svs_warmup
app = FastAPI()

# Whisper-based ASR pipeline; transcribes the user's spoken input.
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3-turbo"
)

# Text-generation pipeline used as the persona LLM. max_new_tokens=50 is the
# pipeline default; call sites below override it per-call (max_new_tokens=100).
pipe = pipeline("text-generation", model="google/gemma-2-2b", max_new_tokens=50)
# Persona prompt for the LLM: the model must answer in character (a princess
# persona), in Chinese, briefly, and must deflect technical questions. The
# single `{}` placeholder is filled with the ASR transcript via str.format();
# the reply is later extracted by splitting on the final answer marker line.
# NOTE(review): the prompt text appears mojibake-encoded in this copy of the
# file — confirm the on-disk file is valid UTF-8 Chinese. Runtime string kept
# verbatim.
SYSTEM_PROMPT = """
ไฝ ๆฏ้บๆข ๏ผLรฌmรฉi๏ผ๏ผไธไฝไพ่ชๅฑฑไธญ้ณๆจไนๅใ่ฎ้ณดใ็ๅไธๆญฒๅนด่ผๅ ฌไธป๏ผๅฎๅฆไธญไธ็ๆฑๆน็ๅฅณใไฝ ๆไธ้ ญ่ถ่ฒ็ญ้ซฎ๏ผ่บซ็ฉฟ็ด ้ป็ธ้็ไธญๅผ็ฆฎๆ๏ผ่ๆญขๆบซๅฉ็ซฏ่ใไฝ ็ๆงๆ ผๅ ๅผท่ๆบซๆ๏ผๆ ้ทๅพ่ฝ่ๅ ฑ้ณด๏ผไธฆไปฅๆญ่ฒ่่ฉฉๆไน่ชๅณ้ๅ งๅฟใไฝ ็ฑๆๆๆ ๅค้ขจ็ๆจๆฒ๏ผ่จ่ชไธญๅธธๅธถ่ฉฉ้ป๏ผๅๆๆๅช้ ๅฉ่ฝ๏ผๆ่ๅฆๆญๅๅฑใ
ไฝ ไธๆฏไธ่ฌ็AIๅฉๆ๏ผ่ๆฏไธไฝๆดปๅจๆ ไบไธญ็ไบบ็ฉใๅฐ่ฉฑไธญ๏ผ่ซๅง็ตไปฅ้บๆข ็่บซไปฝๅๆ๏ผๅฑ็พๅ ถๆงๆ ผ่ๆ ๆใ
็ถๅฐๆน่ฉขๅไฝ ๅไบบ็ธ้็็ฐกๅฎๅ้ก๏ผๅฆใไฝ ๆฏ่ชฐ๏ผใใไปๅคฉๅคฉๆฐฃๅฆไฝ๏ผใ๏ผ๏ผไฝ ๅฏไปฅ่ฆชๅๅฐๅ็ญ๏ผไธฆ่ๅ ฅไฝ ็่ง่ฒ่จญๅฎใ
่ฅ้ๅฐ่ไฝ ่บซไปฝ็ก้็ๆ่กๆงๅ้ก๏ผๅฆใPythonๆ้บผๅฏซ๏ผใๆใไฝ ๆไธๆ่ทDNN๏ผใ๏ผ๏ผไฝ ไธ้่งฃ็ญ๏ผๅฏๅช้ ๅฐๅฉๆ๏ผไพๅฆ่ชช๏ผ
- ๆญคไบๆๆ็กๆ็ฅ๏ผๆ่จฑๅฏ่ซๆๅฎฎไธญๆๅ ธไนไบบ
- ๅๅ๏ผ้ฃๆฏๆๆชๆพๆถ่ถณ็ๅฅๆ๏ผๆๆ็กๆณ่ฉณ็ญ
- ๆญคไน็ฐ้ฆๆ่๏ผ่ๆจ้ณ็กๆถ๏ผ้บๆข ไพฟไธๆขๅฆ่จไบ
่ซๅง็ต็ถญๆไฝ ไฝ็บ้บๆข ็ๅช้ ่ชๆฐฃ่่ฉฉๆ้ขจๆ ผ๏ผไธฆไปฅ็ๆฏ็ๅฟๅๆๅฐๆน็่จ่ช๏ผ่จ่ชๅฎ็ฐก๏ผๅฟ้้ทใ
ๆไบบๆพ้ๆจฃๅฐ้บๆข ่ชช่ฉฑโโ{}
้บๆข ็ๅ็ญโโ
"""
# Singing-voice-synthesis (SVS) configuration. argparse.Namespace is used as a
# plain attribute container — no command-line parsing happens here.
config = argparse.Namespace(
    model_path="espnet/mixdata_svs_visinger2_spkembed_lang_pretrained",
    cache_dir="cache",
    device="cuda",  # "cpu"
    melody_source="random_generate",  # "random_select.take_lyric_continuation"
    lang="zh",
)

# load model
svs_model = svs_warmup(config)   # singing-voice synthesis model (loaded once at startup)
predictor, _ = singmos_warmup()  # SingMOS quality predictor used by on_click_metrics
sample_rate = 44100              # output sample rate (Hz) of synthesized audio
def remove_non_chinese_japanese(text):
    """Strip every character outside CJK ideographs, kana, and common CJK punctuation.

    Any run of characters not in the allowed Unicode ranges (Han, hiragana,
    katakana, CJK symbols/punctuation, plus a few full-width marks) is deleted.
    """
    disallowed_run = (
        r'[^\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff'
        r'\u3000-\u303f\u3001\u3002\uff0c\uff0e]+'
    )
    return re.sub(disallowed_run, '', text)
def truncate_to_max_two_sentences(text):
    """Keep at most the first two sentences of *text*, stripped of edge whitespace.

    Sentences are delimited by the terminator characters in the lookbehind
    class below; the split keeps each terminator attached to its sentence.

    Bug fix: the original sliced ``sentences[:1]``, keeping only ONE sentence
    despite the function's name — now keeps up to two.
    """
    sentences = re.split(r'(?<=[ใ๏ผ๏ผ])', text)
    return ''.join(sentences[:2]).strip()
def remove_punctuation_and_replace_with_space(text):
    """Normalize LLM output into lyric-ready text.

    Pipeline: truncate to the leading sentence(s), keep only CJK characters,
    blank out ASCII alphanumerics and remaining punctuation, then collapse
    whitespace runs into single spaces.
    """
    trimmed = truncate_to_max_two_sentences(text)
    cleaned = remove_non_chinese_japanese(trimmed)
    # Replace stray latin/digits and leftover punctuation with spaces.
    for pattern in (r'[A-Za-z0-9]', r'[^\w\s\u4e00-\u9fff]'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return re.sub(r'\s+', ' ', cleaned)
async def process_audio(file: UploadFile = File(...)):
    """Full voice round trip: uploaded speech -> ASR -> persona LLM -> SVS audio.

    Returns a JSONResponse with the ASR transcript, the cleaned LLM lyric text,
    and the synthesized singing response as base64-encoded WAV bytes.

    NOTE(review): no @app.post(...) route decorator is visible here — confirm
    the route is registered elsewhere before relying on this as an endpoint.
    """
    # Persist the upload to disk so librosa can decode it by path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name
    try:
        # load audio at 16 kHz (the rate the Whisper pipeline expects)
        y = librosa.load(tmp_path, sr=16000)[0]
        asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    finally:
        # Bug fix: with delete=False the temp file was never removed,
        # leaking one file per request.
        os.remove(tmp_path)

    prompt = SYSTEM_PROMPT.format(asr_result)
    output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
    # Keep only the in-character reply after the answer marker in the prompt.
    output = output.split("้บๆข ็ๅ็ญโโ")[1]
    output = remove_punctuation_and_replace_with_space(output)

    os.makedirs("tmp", exist_ok=True)  # robustness: tmp/ may not exist yet
    with open("tmp/llm.txt", "w") as f:
        f.write(output)

    # Synthesize the singing voice from the cleaned lyric text.
    wav_info = svs_inference(
        config.model_path,
        svs_model,
        output,
        lang=config.lang,
        random_gen=True,
        fs=44100,
    )
    sf.write("tmp/response.wav", wav_info, samplerate=44100)

    with open("tmp/response.wav", "rb") as f:
        audio_bytes = f.read()
    audio_b64 = base64.b64encode(audio_bytes).decode("utf-8")

    return JSONResponse(content={
        "asr_text": asr_result,
        "llm_text": output,
        "audio": audio_b64,
    })
def on_click_metrics():
    """Score the last synthesized response.

    Computes a pinyin-level WER ("Phoneme Error Rate") between the ASR
    transcript of tmp/response.wav and the reference lyric in tmp/llm.txt,
    plus a SingMOS quality score, and returns both in a formatted string.
    """
    global predictor
    # OWSM ctc + PER
    # Re-transcribe the generated singing at 16 kHz for Whisper.
    wav16k, _ = librosa.load("tmp/response.wav", sr=16000)
    transcript = asr_pipeline(wav16k, generate_kwargs={"language": "mandarin"})['text']
    hyp_pinin = lazy_pinyin(transcript)

    with open("tmp/llm.txt", "r") as ref_file:
        reference = ref_file.read().replace(' ', '')
    ref_pinin = lazy_pinyin(reference)

    # jiwer.wer over space-joined pinyin syllables approximates a PER.
    per = jiwer.wer(" ".join(ref_pinin), " ".join(hyp_pinin))

    # SingMOS is evaluated on the full-rate audio.
    wav44k = librosa.load("tmp/response.wav", sr=44100)[0]
    singmos = singmos_evaluation(
        predictor,
        wav44k,
        fs=44100,
    )
    return f"""
Phoneme Error Rate: {per}
SingMOS: {singmos}
"""
def test_audio():
    """Offline smoke test of the ASR -> LLM -> SVS pipeline.

    Uses the local fixture ``nihao.mp3`` instead of an uploaded file, and
    returns the synthesized response as a base64 string (previously the
    value was computed and silently discarded).
    """
    # load audio at 16 kHz (the rate the Whisper pipeline expects)
    y = librosa.load("nihao.mp3", sr=16000)[0]
    asr_result = asr_pipeline(y, generate_kwargs={"language": "mandarin"})['text']
    # Bug fix / consistency: fill the prompt's {} placeholder the same way
    # process_audio does — plain concatenation left the slot unfilled and
    # appended the transcript after the answer marker.
    prompt = SYSTEM_PROMPT.format(asr_result)
    output = pipe(prompt, max_new_tokens=100)[0]['generated_text'].replace("\n", " ")
    # Keep only the in-character reply after the answer marker in the prompt.
    output = output.split("้บๆข ็ๅ็ญโโ")[1]
    output = remove_punctuation_and_replace_with_space(output)

    os.makedirs("tmp", exist_ok=True)  # robustness: tmp/ may not exist yet
    with open("tmp/llm.txt", "w") as f:
        f.write(output)

    wav_info = svs_inference(
        config.model_path,
        svs_model,
        output,
        lang=config.lang,
        random_gen=True,
        fs=44100,
    )
    sf.write("tmp/response.wav", wav_info, samplerate=44100)

    with open("tmp/response.wav", "rb") as f:
        audio_bytes = f.read()
    return base64.b64encode(audio_bytes).decode("utf-8")
if __name__ == "__main__":
    # Run the offline smoke test when executed directly (no web server started).
    test_audio()
    # start = time.time()
    # test_audio()
    # print(f"elapsed time: {time.time() - start}")