# HueyWoo's picture
# Upload folder using huggingface_hub
# f4da57d verified
#!/usr/bin/env python3
from pathlib import Path
import os
import argparse
from huggingface_hub import snapshot_download
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
# Hugging Face repository ID the model was uploaded to.
HF_REPO_ID = "AeiROBOT/SenseVoice-Small-ko"
# Local directory the snapshot is downloaded into.
# NOTE(review): hard-coded to one user's home directory; confirm before
# running on another machine.
LOCAL_DIR = "/home/khw/.aeirobot_models/SenseVoice-Small-ko"

# ----- SenseVoice token parser -----
# Each set is the vocabulary for one leading tag position in the model output:
# <language><emotion><event><itn-flag>text
LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
# "<|EMO_UNKNOWN|>" is included so an unclassified emotion does not stop the
# parser at the second tag and leave the remaining tags inside the text.
EMO_TOKENS = {
    "<|HAPPY|>",
    "<|SAD|>",
    "<|ANGRY|>",
    "<|NEUTRAL|>",
    "<|FEARFUL|>",
    "<|DISGUSTED|>",
    "<|SURPRISED|>",
    "<|EMO_UNKNOWN|>",
}
# "<|Event_UNK|>" is included for the same reason as "<|EMO_UNKNOWN|>" above.
EVENT_TOKENS = {
    "<|BGM|>",
    "<|Speech|>",
    "<|Applause|>",
    "<|Laughter|>",
    "<|Cry|>",
    "<|Sneeze|>",
    "<|Breath|>",
    "<|Cough|>",
    "<|Event_UNK|>",
}
WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}
def _consume(prefixes, text: str):
for p in prefixes:
if text.startswith(p):
return p, text[len(p):]
return None, text
def parse_sensevoice_text(raw: str):
    """Split a SenseVoice output string into its leading tags and the text.

    The tags are consumed strictly in the order language, emotion, event,
    ITN flag; a tag that is absent (or not in its vocabulary) yields ``None``
    for that field and parsing continues with the next field.

    Example:
        ``"<|en|><|NEUTRAL|><|Speech|><|withitn|>Hello there."`` ->
        {
            "language": "<|en|>",
            "emo": "<|NEUTRAL|>",
            "event": "<|Speech|>",
            "with_itn": "<|withitn|>",
            "text": "Hello there."
        }

    Args:
        raw: Raw text returned by the model; may be empty or ``None``.

    Returns:
        Dict with keys ``language``, ``emo``, ``event``, ``with_itn`` and
        ``text``. For falsy input every tag is ``None`` and ``text`` is ``""``.
    """
    fields = {"language": None, "emo": None, "event": None, "with_itn": None, "text": ""}
    if not raw:
        return fields

    remainder = raw.strip()
    # Tag positions in the order they are expected at the start of the string.
    tag_vocabularies = (
        ("language", LANG_TOKENS),
        ("emo", EMO_TOKENS),
        ("event", EVENT_TOKENS),
        ("with_itn", WITH_ITN_TOKENS),
    )
    for key, vocabulary in tag_vocabularies:
        fields[key], remainder = _consume(vocabulary, remainder)

    fields["text"] = remainder.strip()
    return fields
def parse_args(argv=None):
    """Parse the command-line options of this script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads the process arguments, as before.

    Returns:
        ``argparse.Namespace`` with a ``wav_file`` attribute
        (default ``"./test.wav"``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--wav_file",
        default="./test.wav",
        # The previous help text described a model name/directory, which is
        # not what this option takes.
        help="Path to the input WAV file to transcribe",
    )
    return parser.parse_args(argv)
def get_model():
    """Download the model snapshot and build a FunASR ``AutoModel`` from it.

    The snapshot of ``HF_REPO_ID`` is fetched into ``LOCAL_DIR`` and the
    returned path is used both as the model directory and as the location of
    the repository's ``model.py`` remote code.

    Returns:
        The constructed ``AutoModel`` instance.
    """
    # Access token taken from the environment (the repository is private,
    # per the original note); ``None`` when the variable is unset.
    hub_token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
    snapshot_dir = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="model",
        local_dir=LOCAL_DIR,
        local_dir_use_symlinks=False,
        token=hub_token,
    )
    print("๋‹ค์šด๋กœ๋“œ ๊ฒฝ๋กœ:", snapshot_dir)

    # Hand the local snapshot path to AutoModel and use the model.py that
    # ships inside the repository.
    remote_code_path = Path(snapshot_dir) / "model.py"
    return AutoModel(
        model=snapshot_dir,
        trust_remote_code=True,
        remote_code=str(remote_code_path),
        vad_model=None,  # VAD disabled; "fsmn-vad" was the noted alternative
        device="cuda:0",
    )
def main():
    """Transcribe the WAV file given on the command line and print the result.

    Prints the raw model text first, then the parsed language / emotion /
    event / ITN tags and the post-processed transcript.
    """
    cli = parse_args()
    wav_path = cli.wav_file
    asr_model = get_model()

    # A previous variant of this call used VAD-oriented options
    # (batch_size_s=60, merge_vad=True, merge_length_s=15) instead of
    # batch_size=1.
    results = asr_model.generate(
        input=wav_path,
        cache={},
        language="auto",  # or "ko"
        use_itn=True,
        batch_size=1,
    )

    raw_text = results[0]["text"]
    parsed = parse_sensevoice_text(raw_text)

    # ITN post-processing, skipped when no text is left after the tags.
    transcript = parsed["text"]
    if transcript:
        pretty_text = rich_transcription_postprocess(transcript)
    else:
        pretty_text = ""

    print("=== Raw ===")
    print(raw_text)
    print("=== Parsed ===")
    report_rows = (
        ("lang :", parsed["language"]),
        ("emo :", parsed["emo"]),
        ("event :", parsed["event"]),
        ("withitn:", parsed["with_itn"]),
        ("text :", pretty_text),
    )
    for label, value in report_rows:
        print(label, value)
# Run the transcription only when executed as a script, not when imported.
if __name__ == "__main__":
    main()