#!/usr/bin/env python3
"""Download the SenseVoice-Small-ko model from Hugging Face and transcribe a wav file.

SenseVoice prefixes its transcript with special tokens in a fixed order —
``<|lang|><|emotion|><|event|><|itn|>text`` — and this script strips each
category off the front, reports them separately, and applies FunASR's
rich-transcription postprocessing to the remaining text.
"""
from pathlib import Path
import os
import argparse

from huggingface_hub import snapshot_download
from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Hugging Face repo ID of the uploaded model (private — a token is required).
HF_REPO_ID = "AeiROBOT/SenseVoice-Small-ko"
# Local snapshot directory. Derived from the current user's home directory so
# the script is not tied to one machine account (was hard-coded to /home/khw).
LOCAL_DIR = str(Path.home() / ".aeirobot_models" / "SenseVoice-Small-ko")

# ----- SenseVoice token parser -----
# Leading special tokens SenseVoice emits, one per category, in this order:
# language, emotion, audio event, ITN flag.
LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
EMO_TOKENS = {"<|HAPPY|>", "<|SAD|>", "<|ANGRY|>", "<|NEUTRAL|>", "<|FEARFUL|>",
              "<|DISGUSTED|>", "<|SURPRISED|>"}
EVENT_TOKENS = {"<|BGM|>", "<|Speech|>", "<|Applause|>", "<|Laughter|>",
                "<|Cry|>", "<|Sneeze|>", "<|Breath|>", "<|Cough|>"}
WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}


def _consume(prefixes, text: str):
    """Strip the first matching prefix from *text*.

    Returns ``(matched_prefix, remainder)``, or ``(None, text)`` when no
    prefix matches. The token sets used here are mutually exclusive as
    prefixes, so set iteration order does not affect the result.
    """
    for prefix in prefixes:
        if text.startswith(prefix):
            return prefix, text[len(prefix):]
    return None, text


def parse_sensevoice_text(raw: str):
    """Split a raw SenseVoice output string into its token parts and clean text.

    Example:
        "<|ko|><|NEUTRAL|><|Speech|><|withitn|>조 금만 생각 을 하 면서 살 면 훨씬 편할 거야."
        -> {
            "language": "<|ko|>",
            "emo": "<|NEUTRAL|>",
            "event": "<|Speech|>",
            "with_itn": "<|withitn|>",
            "text": "조 금만 생각 을 하 면서 살 면 훨씬 편할 거야.",
        }

    Missing categories come back as ``None``; an empty/None input yields an
    all-None result with ``text == ""``.
    """
    if not raw:
        return {"language": None, "emo": None, "event": None,
                "with_itn": None, "text": ""}
    rest = raw.strip()
    # Tokens appear in a fixed order, so consume each category at most once.
    lang, rest = _consume(LANG_TOKENS, rest)
    emo, rest = _consume(EMO_TOKENS, rest)
    event, rest = _consume(EVENT_TOKENS, rest)
    with_itn, rest = _consume(WITH_ITN_TOKENS, rest)
    return {
        "language": lang,
        "emo": emo,
        "event": event,
        "with_itn": with_itn,
        "text": rest.strip(),
    }


def parse_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Transcribe a wav file with SenseVoice-Small-ko.")
    # The previous help text wrongly described a pretrained-model path
    # (copied from another script); this argument is the input audio file.
    parser.add_argument("--wav_file", default="./test.wav",
                        help="path to the input wav file to transcribe")
    return parser.parse_args()


def get_model():
    """Download the model snapshot (if needed) and build a FunASR AutoModel.

    Reads ``HUGGINGFACE_HUB_TOKEN`` from the environment because the repo is
    private.
    """
    local_path = snapshot_download(
        repo_id=HF_REPO_ID,
        repo_type="model",
        local_dir=LOCAL_DIR,
        # NOTE(review): deprecated and ignored by recent huggingface_hub
        # releases; kept for compatibility with older installs — confirm.
        local_dir_use_symlinks=False,
        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),  # required: private repo
    )
    print("다운로드 경로:", local_path)

    # Point AutoModel at the downloaded snapshot and the model.py it ships.
    model_dir = local_path  # equivalently LOCAL_DIR
    model = AutoModel(
        model=model_dir,
        trust_remote_code=True,
        remote_code=str(Path(model_dir) / "model.py"),
        vad_model=None,  # set to "fsmn-vad" (with vad_kwargs) to enable VAD segmentation
        device="cuda:0",  # assumes a CUDA GPU is available — TODO: make configurable
    )
    return model


def main():
    """Transcribe the wav file given on the command line and print the result."""
    args = parse_args()
    model = get_model()

    res = model.generate(
        input=args.wav_file,
        cache={},
        language="auto",  # or "ko" to force Korean
        use_itn=True,
        batch_size=1,
    )

    raw_text = res[0]["text"]
    parsed = parse_sensevoice_text(raw_text)

    # ITN / rich-transcription postprocessing (spacing, punctuation, ...).
    pretty_text = rich_transcription_postprocess(parsed["text"]) if parsed["text"] else ""

    print("=== Raw ===")
    print(raw_text)
    print("=== Parsed ===")
    print("lang :", parsed["language"])
    print("emo :", parsed["emo"])
    print("event :", parsed["event"])
    print("withitn:", parsed["with_itn"])
    print("text :", pretty_text)


if __name__ == "__main__":
    main()