# muse/eval_pipeline/split_lyrics.py
# Jacong's picture
# Upload 96 files
# aa9be1e verified
#!/usr/bin/env python3
"""
Split existing lyrics: split the txt lyric files of one combined directory
into separate Chinese and English directories.

Usage: python split_lyrics.py <source_directory> <target_base_directory>
Example: python split_lyrics.py ./audio/sunov4_5 ./audio
    -> ./audio/sunov4_5_cn/transcription.jsonl (index 1-50 -> 0-49)
    -> ./audio/sunov4_5_en/transcription.jsonl (index 51-100 -> 0-49)
Use case: the lyrics txt files were transcribed before the audio was split.
"""
import os, re, argparse, json
from pathlib import Path
def extract_idx(filename):
    """Return the last run of digits in the filename (extension removed) as an int.

    Returns None when the name without its extension contains no digits.
    """
    stem, _ext = os.path.splitext(filename)
    last_digits = None
    # Walk every digit run; the one seen last is the index.
    for match in re.finditer(r'\d+', stem):
        last_digits = match.group()
    if last_digits is None:
        return None
    return int(last_digits)
def read_lyrics(txt_path):
    """Read a lyrics txt file and return its text flattened to one line.

    A first line consisting only of a language identifier ('chinese',
    'english', 'zh' or 'en', case-insensitive) is dropped. The remaining
    non-blank lines are stripped and joined with single spaces.

    Args:
        txt_path: Path of the UTF-8 encoded txt file (a BOM is tolerated).

    Returns:
        The flattened lyrics, or "" when the file cannot be opened or decoded.
    """
    try:
        # 'utf-8-sig' drops a leading byte-order mark, which would otherwise
        # stick to the first line and defeat the language-identifier check.
        with open(txt_path, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as err:
        # Best effort: an unreadable file still yields empty lyrics, but the
        # failure is reported. Only I/O and decoding errors are handled here,
        # so interrupts and programming errors are no longer swallowed.
        import sys
        print(f"Warning: could not read {txt_path}: {err}", file=sys.stderr)
        return ""
    # Skip first line if it's a language identifier
    if lines and lines[0].strip().lower() in ('chinese', 'english', 'zh', 'en'):
        lines = lines[1:]
    return ' '.join(line.strip() for line in lines if line.strip())
def _infer_audio_ext(src, base):
    """Return the extension of the audio file named `base` in `src` ('.mp3' if none exists)."""
    for ext in ('.mp3', '.wav'):
        if (src / f"{base}{ext}").exists():
            return ext
    return ".mp3"


def _write_transcription(records, out_dir):
    """Write `records`, ordered by file_idx, to `out_dir`/transcription.jsonl."""
    out_dir.mkdir(parents=True, exist_ok=True)
    records.sort(key=lambda rec: rec["file_idx"])
    out_file = out_dir / "transcription.jsonl"
    with open(out_file, 'w', encoding='utf-8') as f:
        for rec in records:
            f.write(json.dumps(rec, ensure_ascii=False) + '\n')
    print(f"{out_file} ({len(records)} songs)")


def main():
    """Split the txt lyrics of one directory into cn/en transcription files.

    Command-line arguments:
        src_dir: directory holding the txt lyric files.
        dst_dir: base directory; records go to <dst_dir>/<src name>_cn and
            <dst_dir>/<src name>_en.

    Files whose index (last number in the file name) is 1-50 become cn
    records 0-49; indices 51-100 become en records 0-49. Files without an
    index, or with an index outside 1-100, are skipped. An output directory
    and its transcription.jsonl are only created when it has records.

    Exits with an argparse usage error (status 2) when src_dir is not an
    existing directory, instead of reporting "Done!" with no output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("src_dir", help="Source directory containing txt lyric files")
    parser.add_argument("dst_dir", help="Target directory (will generate name_cn and name_en)")
    args = parser.parse_args()

    src = Path(args.src_dir)
    dst = Path(args.dst_dir)
    if not src.is_dir():
        parser.error(f"source directory not found: {src}")

    name = src.name
    # One entry per language split:
    # (first source index, last source index, output directory, records).
    splits = [
        (1, 50, dst / f"{name}_cn", []),
        (51, 100, dst / f"{name}_en", []),
    ]

    # Iterate through all txt files
    for txt_path in sorted(src.glob("*.txt")):
        idx = extract_idx(txt_path.name)
        if idx is None:
            continue
        for first, last, out_dir, records in splits:
            if not first <= idx <= last:
                continue
            # Re-base the index so every split starts at 0.
            new_idx = idx - first
            new_name = f"{new_idx:06d}{_infer_audio_ext(src, txt_path.stem)}"
            records.append({
                "file_path": str(out_dir / new_name),
                "file_name": new_name,
                "file_idx": new_idx,
                "hyp_text": read_lyrics(txt_path),
            })
            break

    # Write transcription.jsonl
    for _first, _last, out_dir, records in splits:
        if records:
            _write_transcription(records, out_dir)
    print("Done!")
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()