Upload folder using huggingface_hub

c24bfdd verified 4 months ago

22.6 kB

	# SenseVoice-Small-ko (Fine-tuned SenseVoiceSmall on EDIE dataset)

	이 리포지터리는 SenseVoiceSmall을 한국어 음성/감정/이벤트 인식용 EDIE 데이터셋으로 파인튜닝한 모델입니다.

	- 베이스 모델: iic/SenseVoiceSmall
	- 태스크: STT (ASR) + Emotion (SER) + Event (AED)
	- 주요 라벨:
	- 텍스트 라벨
	- 감정 라벨: <\|HAPPY\|>, <\|SAD\|>, <\|ANGRY\|>, <\|NEUTRAL\|>, <\|FEARFUL\|>, <\|DISGUSTED\|>, <\|SURPRISED\|>


	## 0. 모델 입출력 포맷

	입력
	- input: 단일 wav 경로 또는 경로 리스트

	출력
	출력 예시 (AutoModel)
	- text: 인식된 텍스트
	- language: 언어 ID (<\|ko\|> 등)
	- emo: 감정 라벨 (<\|HAPPY\|>, <\|SAD\|> 등)
	- event: 이벤트 라벨 (<\|Speech\|>, <\|BGM\|> 등)


	## 1. 설치

	```bash
	pip install -U "funasr>=1.2.7" torch
	```

	GPU를 사용할 경우 사전에 CUDA 호환 PyTorch를 설치해 주세요.

	## 2. 간단하게 모델 사용하기

	FunASR의 AutoModel을 이용하면 허깅페이스 모델 허브의 리포지터리에서 모델을 바로 내려받아 사용할 수 있습니다.

	```python
	#!/usr/bin/env python3
	from pathlib import Path
	import os
	import argparse

	from huggingface_hub import snapshot_download
	from funasr import AutoModel
	from funasr.utils.postprocess_utils import rich_transcription_postprocess

	HF_REPO_ID = "AeiROBOT/SenseVoice-Small-ko"  # Hugging Face repo ID of the uploaded model
	# Local directory the snapshot is downloaded into; adjust for your machine.
	LOCAL_DIR = "/home/khw/Workspace/SenseVoice/hf_models/SenseVoice-Small-ko"

	# ----- SenseVoice tag tokens -----
	# parse_sensevoice_text() strips these from the front of the raw output in the
	# order language -> emotion -> event -> ITN flag.
	# The tokens are written without backslashes: "<\|ko\|>" is a Markdown-escaping
	# artifact and would never match a "<|ko|>" tag in the model output.
	LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
	EMO_TOKENS = {"<|HAPPY|>", "<|SAD|>", "<|ANGRY|>", "<|NEUTRAL|>", "<|FEARFUL|>", "<|DISGUSTED|>", "<|SURPRISED|>"}
	EVENT_TOKENS = {"<|BGM|>", "<|Speech|>", "<|Applause|>", "<|Laughter|>", "<|Cry|>", "<|Sneeze|>", "<|Breath|>", "<|Cough|>"}
	WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}


	def _consume(prefixes, text: str):
	    """Strip one leading token from *text*.

	    Args:
	        prefixes: Iterable of candidate token strings.
	        text: String that may start with one of the candidates.

	    Returns:
	        ``(token, remainder)`` when *text* starts with a candidate,
	        otherwise ``(None, text)`` with *text* unchanged.
	    """
	    # Try longer candidates first so the result does not depend on set
	    # iteration order if one token happens to be a prefix of another.
	    for token in sorted(prefixes, key=len, reverse=True):
	        if text.startswith(token):
	            return token, text[len(token):]
	    return None, text


	def parse_sensevoice_text(raw: str):
	    """Split a raw SenseVoice output string into its leading tags and the text.

	    Tags are consumed from the front in the fixed order language, emotion,
	    event, ITN flag. A tag that is absent at its position is reported as
	    ``None`` and parsing continues with the next tag group.

	    Example:
	        "<|ko|><|NEUTRAL|><|Speech|><|withitn|>조 금만 생각 을 하 면서 살 면 훨씬 편할 거야." ->
	        {
	            "language": "<|ko|>",
	            "emo": "<|NEUTRAL|>",
	            "event": "<|Speech|>",
	            "with_itn": "<|withitn|>",
	            "text": "조 금만 생각 을 하 면서 살 면 훨씬 편할 거야."
	        }

	    Args:
	        raw: Raw model output; may be empty or ``None``.

	    Returns:
	        Dict with keys ``language``, ``emo``, ``event``, ``with_itn``
	        (token string or ``None``) and ``text`` (stripped remainder).
	    """
	    parsed = {"language": None, "emo": None, "event": None, "with_itn": None, "text": ""}
	    if not raw:
	        return parsed

	    rest = raw.strip()
	    tag_groups = (
	        ("language", LANG_TOKENS),
	        ("emo", EMO_TOKENS),
	        ("event", EVENT_TOKENS),
	        ("with_itn", WITH_ITN_TOKENS),
	    )
	    for field, tokens in tag_groups:
	        parsed[field], rest = _consume(tokens, rest)

	    parsed["text"] = rest.strip()
	    return parsed


	def parse_args(argv=None):
	    """Parse command-line options for the inference example.

	    Args:
	        argv: Optional list of argument strings; ``None`` (the default)
	            makes argparse read ``sys.argv[1:]``.

	    Returns:
	        Namespace with ``wav_file``, the path of the wav file to transcribe.
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "--wav_file",
	        default="dataset/wav_dataset/DISGUSTED/test_2025_12_12_040201.wav",
	        # The previous help text described a model directory, not this option.
	        help="인식할 입력 wav 파일 경로",
	    )
	    return parser.parse_args(argv)

	def get_model():
	    """Download the model snapshot and build a FunASR ``AutoModel`` from it.

	    The snapshot of ``HF_REPO_ID`` is fetched into ``LOCAL_DIR`` and the
	    ``model.py`` inside it is passed as the remote code. The model runs on
	    ``cuda:0`` when CUDA is available and on the CPU otherwise.

	    Returns:
	        The constructed ``AutoModel`` instance.
	    """
	    # Local import: this script's import block does not pull in torch.
	    import torch

	    # 1) Download the repository snapshot to a local directory.
	    local_path = snapshot_download(
	        repo_id=HF_REPO_ID,
	        repo_type="model",
	        local_dir=LOCAL_DIR,
	        # NOTE(review): newer huggingface_hub releases deprecate this flag —
	        # confirm against the installed version before removing it.
	        local_dir_use_symlinks=False,
	        token=os.environ.get("HUGGINGFACE_HUB_TOKEN"),  # needed because the repo is private
	    )
	    print("다운로드 경로:", local_path)

	    # 2) Hand the local path to AutoModel.
	    model_dir = local_path  # or LOCAL_DIR

	    # Fall back to CPU instead of failing on machines without a usable GPU.
	    device = "cuda:0" if torch.cuda.is_available() else "cpu"

	    return AutoModel(
	        model=model_dir,
	        trust_remote_code=True,
	        remote_code=str(Path(model_dir) / "model.py"),  # model.py shipped in the HF repo
	        vad_model="fsmn-vad",
	        vad_kwargs={"max_single_segment_time": 30000},
	        device=device,
	    )

	def main():
	    """Transcribe one wav file and print the raw and parsed result.

	    Raises:
	        SystemExit: If the model returns no result for the input file.
	    """
	    args = parse_args()
	    wav_path = args.wav_file

	    model = get_model()

	    res = model.generate(
	        input=wav_path,
	        cache={},
	        language="auto",  # or "ko"
	        use_itn=True,
	        batch_size_s=60,
	        merge_vad=True,
	        merge_length_s=15,
	    )
	    # Guard against an empty result list instead of failing with IndexError.
	    if not res:
	        raise SystemExit(f"[fatal] no recognition result returned for {wav_path}")

	    raw_text = res[0].get("text", "") or ""
	    parsed = parse_sensevoice_text(raw_text)

	    # Post-process the tag-free transcript for display.
	    pretty_text = rich_transcription_postprocess(parsed["text"]) if parsed["text"] else ""

	    print("=== Raw ===")
	    print(raw_text)
	    print("=== Parsed ===")
	    print("lang :", parsed["language"])
	    print("emo :", parsed["emo"])
	    print("event :", parsed["event"])
	    print("withitn:", parsed["with_itn"])
	    print("text :", pretty_text)


	if __name__ == "__main__":
	    main()
	```


	## 3. 학습 데이터셋으로 평가하기

	```python
	#!/usr/bin/env python3
	import os
	import json
	import argparse
	import unicodedata
	from pathlib import Path
	from typing import List, Dict, Tuple, Optional

	import torch
	from funasr import AutoModel
	from funasr.utils.postprocess_utils import rich_transcription_postprocess


	# =======================
	# SenseVoice tag tokens
	# =======================
	# parse_sensevoice_text() strips these from the front of the raw output in the
	# order language -> emotion -> event -> ITN flag.
	# The tokens are written without backslashes: "<\|ko\|>" is a Markdown-escaping
	# artifact and would never match a "<|ko|>" tag in the model output.
	LANG_TOKENS = {"<|zh|>", "<|en|>", "<|yue|>", "<|ja|>", "<|ko|>", "<|nospeech|>"}
	EMO_TOKENS = {"<|HAPPY|>", "<|SAD|>", "<|ANGRY|>", "<|NEUTRAL|>", "<|FEARFUL|>", "<|DISGUSTED|>", "<|SURPRISED|>"}
	EVENT_TOKENS = {"<|BGM|>", "<|Speech|>", "<|Applause|>", "<|Laughter|>", "<|Cry|>", "<|Sneeze|>", "<|Breath|>", "<|Cough|>"}
	WITH_ITN_TOKENS = {"<|withitn|>", "<|woitn|>"}


	def _consume(prefixes, text: str):
	    """Strip one leading token from *text*.

	    Args:
	        prefixes: Iterable of candidate token strings.
	        text: String that may start with one of the candidates.

	    Returns:
	        ``(token, remainder)`` when *text* starts with a candidate,
	        otherwise ``(None, text)`` with *text* unchanged.
	    """
	    # Try longer candidates first so the result does not depend on set
	    # iteration order if one token happens to be a prefix of another.
	    for token in sorted(prefixes, key=len, reverse=True):
	        if text.startswith(token):
	            return token, text[len(token):]
	    return None, text


	def parse_sensevoice_text(raw: str) -> Dict[str, Optional[str]]:
	    """Split a raw SenseVoice output string into its leading tags and the text.

	    Tags are consumed from the front in the fixed order language, emotion,
	    event, ITN flag; a tag missing at its position is reported as ``None``.

	    Args:
	        raw: Raw model output; may be empty or ``None``.

	    Returns:
	        Dict with keys ``language``, ``emo``, ``event``, ``with_itn``
	        (token string or ``None``) and ``text`` (stripped remainder).
	    """
	    parsed: Dict[str, Optional[str]] = {
	        "language": None,
	        "emo": None,
	        "event": None,
	        "with_itn": None,
	        "text": "",
	    }
	    if not raw:
	        return parsed

	    remainder = raw.strip()
	    tag_groups = (
	        ("language", LANG_TOKENS),
	        ("emo", EMO_TOKENS),
	        ("event", EVENT_TOKENS),
	        ("with_itn", WITH_ITN_TOKENS),
	    )
	    for field, tokens in tag_groups:
	        parsed[field], remainder = _consume(tokens, remainder)

	    parsed["text"] = remainder.strip()
	    return parsed


	# =======================
	# 텍스트 정규화 & 지표
	# =======================

	def normalize_text(s: str, lower: bool, strip_punct: bool, strip_spaces: bool) -> str:
	    """Normalize a transcript before it is compared or scored.

	    The text is always brought to Unicode NFC first, so that decomposed and
	    precomposed forms of the same Hangul syllable compare equal and count as
	    the same number of characters.

	    Args:
	        s: Input text; ``None`` is treated as an empty string.
	        lower: Lower-case the text.
	        strip_punct: Remove every character whose Unicode category starts
	            with ``P`` (punctuation).
	        strip_spaces: Remove all whitespace.

	    Returns:
	        The normalized text.
	    """
	    if s is None:
	        return ""
	    text = unicodedata.normalize("NFC", s)
	    if lower:
	        text = text.lower()
	    if strip_punct:
	        text = "".join(ch for ch in text if not unicodedata.category(ch).startswith("P"))
	    if strip_spaces:
	        text = "".join(text.split())
	    return text


	def _levenshtein(a: List[str], b: List[str]) -> int:
	    """Return the edit distance between two token sequences.

	    Insertions, deletions and substitutions each cost 1. Only two rows of the
	    dynamic-programming table are kept at a time.

	    Args:
	        a: First sequence (characters or words).
	        b: Second sequence.

	    Returns:
	        Minimum number of edits that turn *a* into *b*.
	    """
	    # With unit costs the distance is symmetric, so put the shorter sequence
	    # on the row axis to keep the working rows small.
	    if len(a) < len(b):
	        a, b = b, a
	    if not b:
	        return len(a)

	    prev = list(range(len(b) + 1))
	    for i, item_a in enumerate(a, start=1):
	        curr = [i]
	        for j, item_b in enumerate(b, start=1):
	            substitution = prev[j - 1] + (0 if item_a == item_b else 1)
	            curr.append(min(prev[j] + 1, curr[j - 1] + 1, substitution))
	        prev = curr
	    return prev[-1]


	def cer(ref: str, hyp: str) -> float:
	    """Return the character error rate of *hyp* against *ref*.

	    The character-level edit distance is divided by the reference length;
	    an empty reference uses a denominator of 1.
	    """
	    ref_chars = list(ref)
	    edits = _levenshtein(ref_chars, list(hyp))
	    return edits / max(1, len(ref_chars))


	def wer(ref: str, hyp: str) -> float:
	    """Return the word error rate of *hyp* against *ref*.

	    Both strings are split on whitespace; the word-level edit distance is
	    divided by the reference word count, with a denominator of 1 when the
	    reference has no words.
	    """
	    ref_words = ref.split()
	    edits = _levenshtein(ref_words, hyp.split())
	    return edits / max(1, len(ref_words))


	def norm_emo(label: Optional[str]) -> str:
	    """Reduce an emotion label to its bare upper-case name.

	    Args:
	        label: Either a tag such as ``"<|HAPPY|>"`` or a plain name such as
	            ``"happy"``; ``None`` and empty strings are accepted.

	    Returns:
	        The upper-cased name without the ``<|`` / ``|>`` wrapper, or ``""``
	        when *label* is empty.
	    """
	    if not label:
	        return ""
	    name = label.strip()
	    # The delimiters are two characters each ("<|" and "|>"), matching the
	    # [2:-2] slice below.
	    if name.startswith("<|") and name.endswith("|>"):
	        name = name[2:-2]
	    return name.upper()


	# =======================
	# IO & argparse
	# =======================

	def parse_args(argv=None):
	    """Parse command-line options for the evaluation script.

	    Args:
	        argv: Optional list of argument strings; ``None`` (the default)
	            makes argparse read ``sys.argv[1:]``.

	    Returns:
	        The parsed ``argparse.Namespace``.
	    """
	    p = argparse.ArgumentParser()
	    p.add_argument("--model-dir", default="/home/khw/Workspace/SenseVoice/outputs", help="finetune 산출물 디렉터리")
	    p.add_argument("--jsonl", default="/home/khw/Workspace/SenseVoice/data/train.jsonl", help="입력 JSONL 경로")
	    p.add_argument("--base-audio-dir", default="/home/khw/Workspace/SenseVoice", help="source 상대경로의 기준 디렉터리")
	    p.add_argument("--remote-code", default="/home/khw/Workspace/SenseVoice/model.py", help="SenseVoice 모델 구현 경로")
	    p.add_argument("--device", default=None, help="cuda:0 / cpu (미지정 시 자동 결정)")
	    p.add_argument("--batch-size", type=int, default=64, help="배치 크기(짧은 음원 다수 가정)")
	    # NOTE: main() never reads this flag; prepare_checkpoint() always applies
	    # its own priority. It is kept so existing command lines keep working.
	    p.add_argument("--use-best-ckpt", action="store_true", help="model.pt.best를 model.pt로 심볼릭 링크 생성")
	    p.add_argument("--lang", default="ko", choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"], help="언어 강제 설정. 기본 ko")
	    # The three switches below select the text normalization applied before scoring.
	    p.add_argument("--lower", action="store_true", help="정밀도 계산 시 소문자화")
	    p.add_argument("--strip-punct", action="store_true", help="정밀도 계산 시 문장부호 제거")
	    p.add_argument("--strip-spaces", action="store_true", help="정밀도 계산 시 모든 공백 제거")
	    p.add_argument("--out", default="/home/khw/Workspace/SenseVoice/results/preds_train.jsonl", help="추론 결과 JSONL")
	    return p.parse_args(argv)


	def _find_latest_epoch_ckpt(model_dir: Path) -> Optional[Path]:
	    """Return the ``model.pt.ep<N>`` file with the largest epoch number.

	    Args:
	        model_dir: Directory to search (non-recursively).

	    Returns:
	        Path of the checkpoint with the highest integer suffix, or ``None``
	        when no file matches. Files whose suffix is not an integer are
	        ignored.
	    """
	    prefix = "model.pt.ep"
	    latest: Optional[Tuple[int, Path]] = None
	    for path in model_dir.glob(prefix + "*"):
	        # Parse the numeric part of the name: model.pt.ep50 -> 50
	        try:
	            epoch = int(path.name[len(prefix):])
	        except ValueError:
	            # Name does not follow the pattern; skip it.
	            continue
	        if latest is None or epoch >= latest[0]:
	            latest = (epoch, path)
	    return None if latest is None else latest[1]


	def prepare_checkpoint(model_dir: Path) -> Path:
	    """Pick the checkpoint to use in *model_dir* and expose it as ``model.pt``.

	    Priority:
	        1) model.pt.best
	        2) the model.pt.ep* file with the highest epoch
	        3) model.pt (existing file)

	    When the chosen file is not ``model.pt`` itself, any existing
	    ``model.pt`` is removed and replaced by a symbolic link to the chosen
	    file, or by a copy when the link cannot be created.

	    Args:
	        model_dir: Directory holding the fine-tuning checkpoints.

	    Returns:
	        Path of the chosen checkpoint file (not of the ``model.pt`` link).

	    Raises:
	        SystemExit: If none of the three candidates exists, or if neither
	            the link nor the copy could be created.
	    """
	    import shutil

	    best = model_dir / "model.pt.best"
	    target = model_dir / "model.pt"  # the file AutoModel ends up reading

	    if best.exists():
	        # 1) model.pt.best has top priority
	        chosen, reason = best, "model.pt.best"
	    else:
	        latest_ep = _find_latest_epoch_ckpt(model_dir)
	        if latest_ep is not None:
	            # 2) checkpoint of the last epoch
	            chosen, reason = latest_ep, latest_ep.name
	        elif target.exists():
	            # 3) existing model.pt
	            chosen, reason = target, "existing model.pt"
	        else:
	            raise SystemExit(
	                f"[fatal] No checkpoint found in {model_dir}. "
	                f"Expected one of: model.pt.best, model.pt.ep*, model.pt. Program will exit."
	            )

	    if chosen == target:
	        print(f"[info] using checkpoint: {reason}")
	        return chosen

	    # Make model.pt refer to the chosen checkpoint (link, else copy).
	    # is_symlink() also catches a dangling link, for which exists() is False.
	    if target.exists() or target.is_symlink():
	        try:
	            target.unlink()
	        except OSError as e:
	            print(f"[warn] failed to remove existing {target}: {e}")

	    try:
	        # Link by relative name so the directory can be moved as a whole.
	        target.symlink_to(chosen.name)
	        print(f"[info] using checkpoint: {chosen.name} (linked as model.pt)")
	    except (OSError, NotImplementedError) as e:
	        # Some filesystems / permission setups refuse symlinks; fall back to a copy.
	        print(f"[warn] symlink failed ({e}), will try to copy instead.")
	        try:
	            shutil.copy2(str(chosen), str(target))
	            print(f"[info] using checkpoint: {chosen.name} (copied to model.pt)")
	        except OSError as e2:
	            raise SystemExit(
	                f"[fatal] failed to prepare checkpoint at {target}: {e2}. Program will exit."
	            ) from e2

	    return chosen


	def load_items(jsonl_path: Path) -> List[Dict]:
	    """Read a JSONL file into a list of dicts.

	    Blank lines are skipped silently. Lines that are not valid JSON, or
	    whose JSON value is not an object, are skipped with a warning that
	    names the line number.

	    Args:
	        jsonl_path: Path of the JSONL file (UTF-8, a leading BOM is tolerated).

	    Returns:
	        The parsed objects in file order.
	    """
	    items: List[Dict] = []
	    with jsonl_path.open("r", encoding="utf-8-sig") as f:
	        for lineno, line in enumerate(f, start=1):
	            line = line.strip()
	            if not line:
	                continue
	            try:
	                obj = json.loads(line)
	            except ValueError as e:  # json.JSONDecodeError is a ValueError
	                print(f"[warn] skip bad line {lineno}: {e}")
	                continue
	            # Later code calls .get() on every item, so only objects are usable.
	            if not isinstance(obj, dict):
	                print(f"[warn] skip line {lineno}: expected a JSON object, got {type(obj).__name__}")
	                continue
	            items.append(obj)
	    return items


	def to_abs_paths(items: List[Dict], base_audio_dir: Path) -> Tuple[List[Dict], int]:
	    """Attach an absolute audio path to every item.

	    Each item is modified in place: ``abs_source`` is set to the resolved
	    ``base_audio_dir / item["source"]`` as a string, or to ``None`` when
	    the item has no ``source``.

	    Args:
	        items: Dataset entries, each optionally carrying a ``source`` path.
	        base_audio_dir: Directory that relative ``source`` paths refer to.

	    Returns:
	        ``(items, missing)`` where *items* is the same list object and
	        *missing* counts entries without ``source`` or whose file does not
	        exist.
	    """
	    missing = 0
	    for item in items:
	        src = item.get("source")
	        if not src:
	            item["abs_source"] = None
	            missing += 1
	            continue
	        resolved = (base_audio_dir / src).resolve()
	        item["abs_source"] = str(resolved)
	        if not resolved.exists():
	            missing += 1
	    return items, missing


	def batched(iterable, n: int):
	    """Yield successive lists of up to *n* items from *iterable*.

	    Args:
	        iterable: Any iterable.
	        n: Maximum batch length; must be at least 1.

	    Yields:
	        Lists of exactly *n* items, followed by one shorter list when the
	        input length is not a multiple of *n*.

	    Raises:
	        ValueError: If *n* is less than 1 (raised on first iteration).
	    """
	    # Without this check a non-positive n would never reach the yield inside
	    # the loop and the whole input would come out as one oversized batch.
	    if n < 1:
	        raise ValueError(f"n must be at least 1, got {n}")
	    batch = []
	    for x in iterable:
	        batch.append(x)
	        if len(batch) == n:
	            yield batch
	            batch = []
	    if batch:
	        yield batch


	# =======================
	# main
	# =======================

	def main():
	    """Run batched inference over a JSONL dataset and report metrics.

	    Steps: select the checkpoint, load the model, read the JSONL items,
	    drop items whose audio file is missing, run inference batch by batch,
	    write one JSON line per sample to ``--out`` and print exact-match
	    accuracy, average CER/WER and emotion accuracy.

	    A batch whose inference raises is logged and skipped; its samples are
	    reported in the summary.

	    Raises:
	        SystemExit: If ``--batch-size`` is below 1, no checkpoint can be
	            prepared, or the remote code file is missing.
	    """
	    args = parse_args()
	    if args.batch_size < 1:
	        raise SystemExit(f"[fatal] --batch-size must be >= 1 (got {args.batch_size}).")

	    model_dir = Path(args.model_dir)
	    jsonl_path = Path(args.jsonl)
	    base_audio_dir = Path(args.base_audio_dir)

	    # Checkpoint priority: model.pt.best > model.pt.ep* (highest epoch) > model.pt
	    ckpt = prepare_checkpoint(model_dir)
	    print(f"[info] final checkpoint file: {ckpt}")

	    device = args.device or ("cuda:0" if torch.cuda.is_available() else "cpu")

	    # model.py (remote_code) is mandatory; stop right away when it is absent.
	    remote_code_path = Path(args.remote_code)
	    if not remote_code_path.exists():
	        raise SystemExit(
	            f"[fatal] remote_code not found at {remote_code_path}. "
	            f"Expected model.py for SenseVoice. Program will exit."
	        )

	    model = AutoModel(
	        model=str(model_dir),  # local directory only
	        trust_remote_code=True,
	        remote_code=str(remote_code_path),
	        device=device,
	        vad_model=None,
	    )

	    items = load_items(jsonl_path)
	    items, _ = to_abs_paths(items, base_audio_dir)

	    valid_items = [it for it in items if it.get("abs_source") and Path(it["abs_source"]).exists()]
	    missing = len(items) - len(valid_items)
	    if missing:
	        print(f"[warn] {missing} items skipped due to missing files")

	    out_path = Path(args.out)
	    out_path.parent.mkdir(parents=True, exist_ok=True)

	    total = len(valid_items)
	    print(f"[info] total inputs used: {total}, device: {device}, language: {args.lang}")
	    if total == 0:
	        print("[exit] No valid audio found. Check --base-audio-dir or 'source' paths.")
	        # Still leave an (empty) result file behind.
	        out_path.write_text("", encoding="utf-8")
	        return

	    # Metric accumulators
	    exact_matches = 0
	    cer_sum = 0.0
	    wer_sum = 0.0
	    text_pairs = 0

	    emo_correct = 0
	    emo_total = 0

	    written = 0
	    failed_samples = 0
	    with out_path.open("w", encoding="utf-8") as wf:
	        for batch in batched(valid_items, args.batch_size):
	            wav_list = [b["abs_source"] for b in batch]

	            try:
	                res = model.generate(
	                    input=wav_list,
	                    cache={},
	                    language=args.lang,
	                    use_itn=True,
	                    batch_size=len(wav_list),
	                )
	            except Exception as e:  # best effort: one bad batch must not abort the run
	                print(f"[error] inference failed on batch starting key={batch[0].get('key')}: {e}")
	                failed_samples += len(batch)
	                continue

	            # zip() below pairs results with inputs by position and stops at
	            # the shorter side, so make a length mismatch visible.
	            if len(res) != len(batch):
	                print(
	                    f"[warn] got {len(res)} results for {len(batch)} inputs in batch "
	                    f"starting key={batch[0].get('key')}; unmatched items are dropped"
	                )

	            for it, r in zip(batch, res):
	                raw_text = r.get("text", "") or ""
	                parsed = parse_sensevoice_text(raw_text)
	                pretty_text = rich_transcription_postprocess(parsed["text"]) if parsed["text"] else ""

	                ref_text = it.get("target") or ""

	                # Text metrics (only for samples that carry a reference)
	                if ref_text:
	                    nt_ref = normalize_text(ref_text, args.lower, args.strip_punct, args.strip_spaces)
	                    nt_hyp = normalize_text(pretty_text, args.lower, args.strip_punct, args.strip_spaces)

	                    if nt_ref == nt_hyp:
	                        exact_matches += 1
	                    cer_sum += cer(nt_ref, nt_hyp)
	                    wer_sum += wer(nt_ref, nt_hyp)
	                    text_pairs += 1

	                # Emotion metrics (only for samples that carry a target emotion)
	                tgt_emo_n = norm_emo(it.get("emo_target"))
	                pred_emo_n = norm_emo(parsed["emo"])
	                if tgt_emo_n:
	                    emo_total += 1
	                    if pred_emo_n == tgt_emo_n:
	                        emo_correct += 1

	                out_obj = {
	                    "key": it.get("key"),
	                    "audio": it.get("abs_source"),
	                    "pred_raw": raw_text,
	                    "pred_text": pretty_text,
	                    "ref_text": ref_text,
	                    "pred_language": parsed["language"],
	                    "pred_emo": pred_emo_n or parsed["emo"] or "",
	                    "ref_emo": tgt_emo_n or it.get("emo_target") or "",
	                    "pred_event": parsed["event"] or "",
	                    "with_itn": parsed["with_itn"] or "",
	                }
	                wf.write(json.dumps(out_obj, ensure_ascii=False) + "\n")

	                # ===== human-readable per-sample output =====
	                idx = written + 1
	                print(f"\n[{idx}] key={it.get('key')}")
	                print("REF_TEXT :", ref_text)
	                print("REF_EMO :", tgt_emo_n or it.get("emo_target"))
	                print("PRED_TEXT:", pretty_text)
	                print("PRED_EMO :", pred_emo_n or parsed["emo"])  # falls back to the raw token
	                print("PRED_EVT :", parsed["event"])  # show the event tag as well
	                print("-" * 80)

	                written += 1

	    # Summary
	    print("\n===== Summary =====")
	    print(f"Samples inferred: {written}")
	    if failed_samples:
	        print(f"Samples skipped (failed batches): {failed_samples}")
	    if text_pairs > 0:
	        exact_acc = exact_matches / text_pairs * 100.0
	        avg_cer = cer_sum / text_pairs
	        avg_wer = wer_sum / text_pairs
	        print(f"Text pairs (with ref): {text_pairs}")
	        print(f"- Exact match accuracy: {exact_acc:.2f}%")
	        print(f"- Avg CER: {avg_cer:.4f}")
	        print(f"- Avg WER: {avg_wer:.4f}")
	    else:
	        print("No text references found; text metrics skipped.")

	    if emo_total > 0:
	        emo_acc = emo_correct / emo_total * 100.0
	        print(f"Emotion pairs: {emo_total}")
	        print(f"- Emotion accuracy: {emo_acc:.2f}%")
	    else:
	        print("No emotion references found; emotion metrics skipped.")

	    print(f"Results saved to: {out_path}")


	if __name__ == "__main__":
	    main()


	```


	## 4. 학습 후 허깅페이스에 모델 업로드

	upload_model_to_huggingface.py

	```python

	#!/usr/bin/env python3
	import os
	from pathlib import Path

	from huggingface_hub import HfApi, create_repo, upload_folder

	# ===== User settings =====
	# ID of the Hugging Face model repo to create (example value)
	REPO_ID = "AeiROBOT/SenseVoice-Small-ko" # <-- change to the name you want

	# Local folder to upload (training output)
	MODEL_DIR = Path("/home/khw/Workspace/SenseVoice/outputs")

	# Extra local files to upload alongside, e.g. model.py (for FunASR/SenseVoice).
	# Can be left out if the file has already been copied into outputs.
	EXTRA_FILES = [
	Path("/home/khw/Workspace/SenseVoice/model.py"), # comment out if the file does not exist
	]


	def main():
	    """Create the Hugging Face model repo if needed and upload ``MODEL_DIR``.

	    Before uploading, every existing file in ``EXTRA_FILES`` is copied into
	    ``MODEL_DIR`` (unless a file of that name is already there), and
	    ``README_huggingface.md`` from the current working directory is copied
	    to ``MODEL_DIR/README.md``. Intermediate epoch checkpoints are excluded
	    from the upload.

	    Raises:
	        RuntimeError: If ``HUGGINGFACE_HUB_TOKEN`` is unset or empty, or if
	            ``MODEL_DIR`` is not an existing directory.
	    """
	    # 1) Read the token (environment variable recommended);
	    #    run `export HUGGINGFACE_HUB_TOKEN=hf_xxx` beforehand.
	    token = os.environ.get("HUGGINGFACE_HUB_TOKEN")
	    if not token:
	        raise RuntimeError(
	            "HUGGINGFACE_HUB_TOKEN 환경변수가 설정되어 있지 않습니다. "
	            "https://huggingface.co/settings/tokens 에서 토큰을 만들고,\n"
	            "export HUGGINGFACE_HUB_TOKEN=hf_xxx 로 설정한 뒤 다시 실행하세요."
	        )

	    # Check the folder before touching the Hub, so a wrong path does not
	    # leave an empty repo behind.
	    if not MODEL_DIR.is_dir():
	        raise RuntimeError(f"MODEL_DIR does not exist or is not a directory: {MODEL_DIR}")

	    # 2) Create the repository (exist_ok=True lets an existing repo pass).
	    create_repo(
	        repo_id=REPO_ID,
	        token=token,
	        private=True,  # True uploads as a private repo
	        exist_ok=True,
	        repo_type="model",
	    )

	    # 3) Copy extra files (model.py etc.) into the output folder (optional), so
	    #    that README.md, model.pt, config.yaml, configuration.json and model.py
	    #    end up together in the repo root.
	    for extra in EXTRA_FILES:
	        if extra.is_file():
	            target = MODEL_DIR / extra.name
	            # An existing copy is kept as is; it is not refreshed.
	            if not target.exists():
	                print(f"[info] copy {extra} -> {target}")
	                target.write_bytes(extra.read_bytes())
	        else:
	            print(f"[warn] extra file not found: {extra}")

	    # 3-1) Model card: copy README_huggingface.md from the current working
	    #      directory to outputs/README.md, the file the Hub shows as the card.
	    readme_src = Path.cwd() / "README_huggingface.md"
	    readme_dst = MODEL_DIR / "README.md"
	    if readme_src.is_file():
	        print(f"[info] copy {readme_src} -> {readme_dst}")
	        readme_dst.write_text(readme_src.read_text(encoding="utf-8"), encoding="utf-8")
	    else:
	        print(f"[warn] README_huggingface.md not found in CWD: {Path.cwd()}")

	    # 4) Upload the whole folder.
	    print(f"[info] uploading folder: {MODEL_DIR} -> {REPO_ID}")
	    upload_folder(
	        repo_id=REPO_ID,
	        folder_path=str(MODEL_DIR),
	        path_in_repo=".",  # upload into the repo root as is
	        token=token,
	        repo_type="model",
	        ignore_patterns=[
	            "model.pt.ep*",  # skip per-epoch checkpoints
	            # Wildcards are required: the bare pattern ".pt.ep" only matched a
	            # file with exactly that name.
	            "*.pt.ep*",
	        ],
	    )

	    print("[done] uploaded to:", f"https://huggingface.co/{REPO_ID}")


	if __name__ == "__main__":
	    main()


	```