|
|
|
|
|
""" |
|
|
Split existing lyrics: read the transcribed .txt lyric files in one directory
and write separate Chinese and English transcription manifests.

Usage: python split_lyrics.py <source_directory> <target_base_directory>

Example: python split_lyrics.py ./audio/sunov4_5 ./audio
    -> ./audio/sunov4_5_cn/transcription.jsonl (index 1-50 -> 0-49)
    -> ./audio/sunov4_5_en/transcription.jsonl (index 51-100 -> 0-49)

Use case: the lyrics .txt files were transcribed before the audio was split
into per-language directories.
|
|
""" |
|
|
import os, re, argparse, json |
|
|
from pathlib import Path |
|
|
|
|
|
def extract_idx(filename):
    """Return the last run of digits in a file name as an int.

    The extension is removed first, so digits that appear only in the
    suffix (e.g. ``.mp3``) are ignored. Returns ``None`` when the remaining
    name contains no digits at all.
    """
    stem, _extension = os.path.splitext(filename)
    digit_runs = re.findall(r'\d+', stem)
    if not digit_runs:
        return None
    return int(digit_runs[-1])
|
|
|
|
|
def read_lyrics(txt_path):
    """Read a lyrics txt file and return its text flattened to one line.

    A first line consisting only of a language tag (``chinese``, ``english``,
    ``zh`` or ``en``, case-insensitive) is dropped. The remaining lines are
    stripped, blank lines are discarded, and the rest are joined with single
    spaces.

    Args:
        txt_path: Path of the lyrics file (``str`` or ``os.PathLike``).

    Returns:
        The flattened lyrics, or ``""`` when the file cannot be opened/read
        or is not valid UTF-8.
    """
    language_tags = ('chinese', 'english', 'zh', 'en')

    try:
        # 'utf-8-sig' reads plain UTF-8 unchanged but strips a leading BOM,
        # which would otherwise hide the language tag on the first line and
        # leak a U+FEFF character into the returned lyrics.
        with open(txt_path, 'r', encoding='utf-8-sig') as f:
            lines = [line.strip() for line in f]
    except (OSError, UnicodeError):
        # Best effort: an unreadable or undecodable file yields empty lyrics.
        # Only I/O and decoding failures are swallowed, so interrupts and
        # programming errors still propagate.
        return ""

    if lines and lines[0].lower() in language_tags:
        lines = lines[1:]
    return ' '.join(line for line in lines if line)
|
|
|
|
|
def main():
    """Build per-language transcription manifests from a directory of lyrics.

    Parses ``src_dir`` and ``dst_dir`` from the command line, reads every
    ``*.txt`` file in ``src_dir`` and routes it by the index found in its
    file name: 1-50 go to ``<dst_dir>/<name>_cn`` and 51-100 go to
    ``<dst_dir>/<name>_en`` (``<name>`` is the source directory's name), each
    re-indexed from 0. For every non-empty group a ``transcription.jsonl`` is
    written with one JSON record per song, sorted by the new index.

    Only the manifests are written; no audio or txt files are copied or
    moved. Exits with an argparse error if ``src_dir`` is not a directory.
    """
    import sys  # local import: only needed for the stderr diagnostic below

    parser = argparse.ArgumentParser()
    parser.add_argument("src_dir", help="Source directory containing txt lyric files")
    parser.add_argument("dst_dir", help="Target directory (will generate name_cn and name_en)")
    args = parser.parse_args()

    src = Path(args.src_dir)
    dst = Path(args.dst_dir)

    # A missing source directory would otherwise glob to nothing and report
    # "Done!" without writing anything.
    if not src.is_dir():
        parser.error(f"source directory not found: {src}")

    # Path(".").name is empty; fall back to the resolved directory name so
    # the outputs are not called just "_cn" / "_en".
    name = src.name or src.resolve().name

    # (language tag, first source index, last source index, output directory)
    groups = [
        ("cn", 1, 50, dst / f"{name}_cn"),
        ("en", 51, 100, dst / f"{name}_en"),
    ]
    records = {lang: [] for lang, *_ in groups}
    skipped = []

    for txt_path in sorted(src.glob("*.txt")):
        idx = extract_idx(txt_path.name)
        group = None
        if idx is not None:
            group = next((g for g in groups if g[1] <= idx <= g[2]), None)
        if group is None:
            # No index in the file name, or an index outside every range.
            skipped.append(txt_path.name)
            continue
        lang, first, _last, out_dir = group

        lyrics = read_lyrics(txt_path)

        # Use the extension of the audio file sitting next to the txt file;
        # default to .mp3 when no matching audio file exists.
        base = txt_path.stem
        audio_ext = next(
            (ext for ext in ('.mp3', '.wav') if (src / f"{base}{ext}").exists()),
            ".mp3",
        )

        new_idx = idx - first  # re-index each language group from 0
        new_name = f"{new_idx:06d}{audio_ext}"
        records[lang].append({
            "file_path": str(out_dir / new_name),
            "file_name": new_name,
            "file_idx": new_idx,
            "hyp_text": lyrics,
        })

    for lang, _first, _last, out_dir in groups:
        trans_list = records[lang]
        if not trans_list:
            continue
        out_dir.mkdir(parents=True, exist_ok=True)
        trans_list.sort(key=lambda rec: rec["file_idx"])
        out_file = out_dir / "transcription.jsonl"
        with open(out_file, 'w', encoding='utf-8') as f:
            f.writelines(
                json.dumps(rec, ensure_ascii=False) + '\n' for rec in trans_list
            )
        print(f"{out_file} ({len(trans_list)} songs)")

    if skipped:
        print(
            f"Skipped {len(skipped)} txt file(s) without an index in 1-100: "
            f"{', '.join(skipped)}",
            file=sys.stderr,
        )

    print("Done!")
|
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|
|
|