File size: 3,296 Bytes
aa9be1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python3
"""
Split existing lyrics: divide the txt lyric files of one combined directory into separate Chinese and English directories.
Usage: python split_lyrics.py <source_directory> <target_base_directory>
Example: python split_lyrics.py ./audio/sunov4_5 ./audio
      -> ./audio/sunov4_5_cn/transcription.jsonl (index 1-50 -> 0-49)
      -> ./audio/sunov4_5_en/transcription.jsonl (index 51-100 -> 0-49)

Use case: When lyrics txt files have been transcribed before splitting audio
"""
import os, re, argparse, json
from pathlib import Path

def extract_idx(filename):
    """Return the last run of digits in the file name (extension excluded) as an int.

    Returns None when the name without its extension contains no digits.
    """
    stem, _extension = os.path.splitext(filename)
    last_number = None
    # Walk every digit run; whichever is seen last wins.
    for hit in re.finditer(r'\d+', stem):
        last_number = hit.group()
    if last_number is None:
        return None
    return int(last_number)

def read_lyrics(txt_path):
    """Read a lyrics txt file and return its text as a single line.

    A first line consisting only of a language identifier (``chinese``,
    ``english``, ``zh`` or ``en``, case-insensitive) is dropped. The remaining
    non-blank lines are stripped and joined with single spaces.

    Args:
        txt_path: Path of the txt file to read.

    Returns:
        The joined lyrics, or an empty string when the file cannot be opened
        or is not valid UTF-8 (a warning is printed to stderr in that case).
    """
    try:
        # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
        # would otherwise hide the language identifier on the first line and
        # leak into the returned lyrics.
        with open(txt_path, 'r', encoding='utf-8-sig') as f:
            lines = f.readlines()
    except (OSError, UnicodeError) as err:
        # Best effort: an unreadable file yields empty lyrics, but say so
        # instead of failing silently.
        import sys
        print(f"Warning: could not read {txt_path}: {err}", file=sys.stderr)
        return ""

    # Skip first line if it's a language identifier
    if lines and lines[0].strip().lower() in ('chinese', 'english', 'zh', 'en'):
        lines = lines[1:]
    return ' '.join(line.strip() for line in lines if line.strip())

def main():
    """Build per-language ``transcription.jsonl`` files from a directory of lyric txt files.

    Command-line arguments:
        src_dir: Directory containing the ``*.txt`` lyric files.
        dst_dir: Base directory; output goes to ``<dst_dir>/<src name>_cn``
            and ``<dst_dir>/<src name>_en``.

    The last number in each txt file name selects the language set: 1-50 goes
    to the ``_cn`` set and 51-100 to the ``_en`` set, each renumbered from 0.
    Files without a number, or with a number outside 1-100, are skipped and
    listed on stderr. Only the jsonl files are written here; the audio paths
    recorded in them are not created or checked in the output directories.
    """
    import sys  # local import: only used for the stderr report below

    parser = argparse.ArgumentParser()
    parser.add_argument("src_dir", help="Source directory containing txt lyric files")
    parser.add_argument("dst_dir", help="Target directory (will generate name_cn and name_en)")
    args = parser.parse_args()

    src = Path(args.src_dir)
    dst = Path(args.dst_dir)
    # Globbing a missing directory yields nothing, which would otherwise end
    # in a misleading "Done!" with no output at all.
    if not src.is_dir():
        parser.error(f"source directory not found: {src}")

    name = src.name

    # One entry per language: (first index, last index, output dir, records).
    splits = [
        (1, 50, dst / f"{name}_cn", []),
        (51, 100, dst / f"{name}_en", []),
    ]
    skipped = []

    # Iterate through all txt files
    for txt_path in sorted(src.glob("*.txt")):
        idx = extract_idx(txt_path.name)
        target = None
        if idx is not None:
            target = next((s for s in splits if s[0] <= idx <= s[1]), None)
        if target is None:
            skipped.append(txt_path.name)
            continue
        first_idx, _last_idx, out_dir, records = target

        lyrics = read_lyrics(txt_path)

        # Infer audio extension from a sibling audio file; default to .mp3
        # when neither candidate exists next to the txt file.
        base = txt_path.stem
        audio_ext = next(
            (ext for ext in ('.mp3', '.wav') if (src / f"{base}{ext}").exists()),
            ".mp3",
        )

        # Renumber so every language set starts at 0.
        new_idx = idx - first_idx
        new_name = f"{new_idx:06d}{audio_ext}"
        records.append({
            "file_path": str(out_dir / new_name),
            "file_name": new_name,
            "file_idx": new_idx,
            "hyp_text": lyrics
        })

    # Write transcription.jsonl
    for _first_idx, _last_idx, out_dir, records in splits:
        if not records:
            continue
        out_dir.mkdir(parents=True, exist_ok=True)
        records.sort(key=lambda rec: rec["file_idx"])
        out_file = out_dir / "transcription.jsonl"
        with open(out_file, 'w', encoding='utf-8') as f:
            for rec in records:
                f.write(json.dumps(rec, ensure_ascii=False) + '\n')
        print(f"{out_file} ({len(records)} songs)")

    if skipped:
        print(
            f"Skipped {len(skipped)} file(s) without an index in 1-100: "
            f"{', '.join(skipped)}",
            file=sys.stderr,
        )

    print("Done!")

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()