Spaces:

Jacong
/

muse

Runtime error

App Files Files Community

muse / eval_pipeline /split_lyrics.py

Jacong

Upload 96 files

aa9be1e verified about 1 month ago

raw

history blame contribute delete

3.3 kB

	#!/usr/bin/env python3
	"""
	Split existing lyrics: partition the txt lyric files of one source directory into Chinese and English transcription manifests
	Usage: python split_lyrics.py <source_directory> <target_base_directory>
	Example: python split_lyrics.py ./audio/sunov4_5 ./audio
	-> ./audio/sunov4_5_cn/transcription.jsonl (index 1-50 -> 0-49)
	-> ./audio/sunov4_5_en/transcription.jsonl (index 51-100 -> 0-49)

	Use case: When lyrics txt files have been transcribed before splitting audio
	"""
	import os, re, argparse, json
	from pathlib import Path

	def extract_idx(filename):
	    """Return the last run of digits in the file name (extension excluded) as an int.

	    Returns None when the name without its extension contains no digits.
	    """
	    stem, _extension = os.path.splitext(filename)
	    digit_runs = re.findall(r'\d+', stem)
	    if not digit_runs:
	        return None
	    # Only the trailing number counts; earlier digits may belong to a prefix.
	    return int(digit_runs[-1])

	def read_lyrics(txt_path):
	    """Read a lyrics txt file and return its content flattened to one line.

	    A first line consisting only of a language identifier ("chinese",
	    "english", "zh" or "en", case-insensitive) is dropped. The remaining
	    non-blank lines are stripped and joined with single spaces.

	    Args:
	        txt_path: Path of the UTF-8 encoded lyrics file.

	    Returns:
	        The flattened lyrics, or "" when the file cannot be opened or decoded.
	    """
	    try:
	        # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
	        # would otherwise hide the language identifier and leak into the text.
	        with open(txt_path, 'r', encoding='utf-8-sig') as f:
	            lines = f.readlines()
	    except (OSError, UnicodeError):
	        # Best effort: an unreadable file yields empty lyrics instead of
	        # aborting the run. Other exceptions (e.g. KeyboardInterrupt) propagate.
	        return ""

	    # Skip first line if it's a language identifier
	    if lines and lines[0].strip().lower() in ('chinese', 'english', 'zh', 'en'):
	        lines = lines[1:]
	    return ' '.join(line.strip() for line in lines if line.strip())

	def main():
	    """Command-line entry point: build per-language transcription.jsonl files.

	    Reads every ``*.txt`` file in ``src_dir``, takes the number from its name
	    via ``extract_idx`` and its text via ``read_lyrics``. Source indices 1-50
	    go to ``<dst_dir>/<src name>_cn`` and 51-100 to ``<dst_dir>/<src name>_en``,
	    each re-indexed from 0. Files without a number, or with a number outside
	    1-100, are ignored. A directory is only created when it has records.
	    """
	    cli = argparse.ArgumentParser()
	    cli.add_argument("src_dir", help="Source directory containing txt lyric files")
	    cli.add_argument("dst_dir", help="Target directory (will generate name_cn and name_en)")
	    opts = cli.parse_args()

	    source_root = Path(opts.src_dir)
	    target_root = Path(opts.dst_dir)
	    stem = source_root.name

	    # One entry per language:
	    # (first source index, last source index, output directory, records).
	    buckets = [
	        (1, 50, target_root / f"{stem}_cn", []),
	        (51, 100, target_root / f"{stem}_en", []),
	    ]

	    for lyric_file in sorted(source_root.glob("*.txt")):
	        number = extract_idx(lyric_file.name)
	        if number is None:
	            continue

	        text = read_lyrics(lyric_file)

	        # Use the extension of a sibling audio file when one exists
	        # (.mp3 checked before .wav); fall back to .mp3 otherwise.
	        suffix = next(
	            (
	                candidate
	                for candidate in ('.mp3', '.wav')
	                if (source_root / f"{lyric_file.stem}{candidate}").exists()
	            ),
	            ".mp3",
	        )

	        for first, last, folder, records in buckets:
	            if not (first <= number <= last):
	                continue
	            # Re-base so each language starts counting at zero.
	            shifted = number - first
	            audio_name = f"{shifted:06d}{suffix}"
	            records.append({
	                "file_path": str(folder / audio_name),
	                "file_name": audio_name,
	                "file_idx": shifted,
	                "hyp_text": text
	            })
	            break

	    # Write one transcription.jsonl per non-empty language bucket.
	    for _first, _last, folder, records in buckets:
	        if not records:
	            continue
	        folder.mkdir(parents=True, exist_ok=True)
	        records.sort(key=lambda entry: entry["file_idx"])
	        manifest = folder / "transcription.jsonl"
	        with open(manifest, 'w', encoding='utf-8') as handle:
	            for entry in records:
	                handle.write(json.dumps(entry, ensure_ascii=False) + '\n')
	        print(f"{manifest} ({len(records)} songs)")

	    print("Done!")

	# Run the command-line entry point only when executed as a script, not on import.
	if __name__ == "__main__":
	main()