#!/usr/bin/env python3
# File size: 3,832 Bytes | f4da57d
from pathlib import Path
import os
import argparse
from huggingface_hub import snapshot_download
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Hugging Face Hub repository ID of the uploaded model.
HF_REPO_ID = "AeiROBOT/SenseVoice-Small-ko"
# Local directory the model snapshot is downloaded into.
# NOTE(review): user-specific absolute path; adjust when running on another machine.
LOCAL_DIR = "/home/khw/.aeirobot_models/SenseVoice-Small-ko"

# ----- SenseVoice token parser -----
# Tag vocabularies for the prefix that precedes the transcript, e.g.
# "<|ko|><|NEUTRAL|><|Speech|><|withitn|>...".
LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
# "<|EMO_UNKNOWN|>" / "<|Event_UNK|>" are the placeholder tags listed in
# FunASR's post-processing tables for an unrecognised emotion / event. Without
# them a prefix such as "<|en|><|EMO_UNKNOWN|><|Speech|><|woitn|>" stops being
# consumed after the language tag and every later field comes back as None.
EMO_TOKENS = {
    "<|HAPPY|>",
    "<|SAD|>",
    "<|ANGRY|>",
    "<|NEUTRAL|>",
    "<|FEARFUL|>",
    "<|DISGUSTED|>",
    "<|SURPRISED|>",
    "<|EMO_UNKNOWN|>",
}
EVENT_TOKENS = {
    "<|BGM|>",
    "<|Speech|>",
    "<|Applause|>",
    "<|Laughter|>",
    "<|Cry|>",
    "<|Sneeze|>",
    "<|Breath|>",
    "<|Cough|>",
    "<|Event_UNK|>",
}
WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}
def _consume(prefixes, text: str):
for p in prefixes:
if text.startswith(p):
return p, text[len(p):]
return None, text
def parse_sensevoice_text(raw: str):
    """Split a SenseVoice output string into its leading tags and the transcript.

    Tags are consumed from the start of the stripped string in a fixed order:
    language, emotion, event, ITN flag. Each stage is tried independently, so
    a tag that is absent (or not in the corresponding vocabulary) yields
    ``None`` for that field and leaves the remaining text untouched.

    Example:
        "<|ko|><|NEUTRAL|><|Speech|><|withitn|>some transcript." ->
        {
            "language": "<|ko|>",
            "emo": "<|NEUTRAL|>",
            "event": "<|Speech|>",
            "with_itn": "<|withitn|>",
            "text": "some transcript."
        }

    Args:
        raw: Raw text as produced by the model; may be empty or ``None``.

    Returns:
        dict with keys ``language``, ``emo``, ``event``, ``with_itn`` (the
        matched tag or ``None``) and ``text`` (the stripped remainder; ``""``
        for falsy input).
    """
    fields = {"language": None, "emo": None, "event": None, "with_itn": None, "text": ""}
    if not raw:
        return fields

    # (result key, tag vocabulary) pairs, in the order the tags are consumed.
    stages = (
        ("language", LANG_TOKENS),
        ("emo", EMO_TOKENS),
        ("event", EVENT_TOKENS),
        ("with_itn", WITH_ITN_TOKENS),
    )
    remaining = raw.strip()
    for key, vocabulary in stages:
        fields[key], remaining = _consume(vocabulary, remaining)

    fields["text"] = remaining.strip()
    return fields
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is how the script entry
            point calls this function.

    Returns:
        argparse.Namespace with a single attribute ``wav_file``: the path of
        the audio file to transcribe (default ``"./test.wav"``).
    """
    parser = argparse.ArgumentParser()
    # The previous help text described a pretrained model name / local
    # directory, which is not what this option is.
    parser.add_argument(
        "--wav_file",
        default="./test.wav",
        help="path to the input WAV file to transcribe (default: %(default)s)",
    )
    return parser.parse_args(argv)
def get_model():
    """Download the model snapshot from the Hugging Face Hub and load it.

    The snapshot of ``HF_REPO_ID`` is fetched into ``LOCAL_DIR`` and the
    resulting directory is handed to FunASR's ``AutoModel`` together with the
    ``model.py`` file found inside that directory.

    Returns:
        The constructed ``AutoModel`` instance (no VAD model, device
        ``"cuda:0"``).
    """
    # The original author's note says the repository is private, hence the
    # token; it is read from the HUGGINGFACE_HUB_TOKEN environment variable
    # and is None when that variable is unset.
    hub_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
    snapshot_dir = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="model",
        local_dir=LOCAL_DIR,
        local_dir_use_symlinks=False,
        token=hub_token,
    )
    # NOTE(review): the label below looks mis-encoded in this copy of the
    # file; it is runtime output and therefore left exactly as found.
    print("๋ค์ด๋ก๋ ๊ฒฝ๋ก:", snapshot_dir)

    # Pass the local snapshot path (the original note mentions LOCAL_DIR as an
    # alternative) and use the model.py shipped in the repository.
    remote_code_path = Path(snapshot_dir) / "model.py"
    return AutoModel(
        model=snapshot_dir,
        trust_remote_code=True,
        remote_code=str(remote_code_path),
        # VAD is disabled; the original left "fsmn-vad" with
        # vad_kwargs={"max_single_segment_time": 30000} commented out.
        vad_model=None,
        device="cuda:0",
    )
def main():
    """Transcribe one audio file and print the raw and parsed result.

    Reads ``--wav_file`` from the command line, loads the model, runs
    ``generate`` on the file, splits the first result's text into tags and
    transcript, and prints everything to stdout.
    """
    wav_path = parse_args().wav_file
    recognizer = get_model()

    # NOTE: an earlier variant of this call (previously kept as commented-out
    # code) passed batch_size_s=60, merge_vad=True, merge_length_s=15 instead
    # of batch_size=1.
    results = recognizer.generate(
        input=wav_path,
        cache={},
        language="auto",  # or "ko"
        use_itn=True,
        batch_size=1,
    )

    raw_text = results[0]["text"]
    parsed = parse_sensevoice_text(raw_text)

    # ITN post-processing via FunASR's helper; skipped for an empty transcript.
    transcript = parsed["text"]
    if transcript:
        pretty_text = rich_transcription_postprocess(transcript)
    else:
        pretty_text = ""

    print("=== Raw ===")
    print(raw_text)
    print("=== Parsed ===")
    report = (
        ("lang :", parsed["language"]),
        ("emo :", parsed["emo"]),
        ("event :", parsed["event"]),
        ("withitn:", parsed["with_itn"]),
        ("text :", pretty_text),
    )
    for label, value in report:
        print(label, value)
# Run the transcription demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|