File size: 1,575 Bytes
aa9be1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/env python3
"""
Split audio: Split 100 songs into Chinese and English directories
Usage: python split_audio.py <input_directory> <output_directory>
Example: python split_audio.py /path/to/audio /path/to/output
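      Input: files indexed 1-50 are Chinese, 51-100 are English
      Output: <input_dir_name>_cn/ and <input_dir_name>_en/ (e.g. model_cn/, model_en/),
              each renumbered to 0-49 and written as zero-padded names (000000-000049)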
      Output numbering aligns with GT file_index
"""
import os, re, shutil, argparse
from pathlib import Path

def extract_idx(filename):
    """Return the last run of digits in the file's stem as an int.

    The extension split off by ``os.path.splitext`` is ignored, so digits
    in the suffix (e.g. the ``3`` in ``.mp3``) never count.

    Args:
        filename: File name to inspect.

    Returns:
        The integer value of the final digit run in the stem, or ``None``
        when the stem contains no digits.
    """
    stem, _ext = os.path.splitext(filename)
    last_run = None
    # Walk every digit run; whichever one is seen last wins.
    for digit_match in re.finditer(r'\d+', stem):
        last_run = digit_match.group()
    if last_run is None:
        return None
    return int(last_run)

def split(src, dst):
    """Copy indexed audio files from *src* into ``_cn`` and ``_en`` folders.

    Two directories, ``<name>_cn`` and ``<name>_en``, are created under
    *dst*, where ``<name>`` is the final component of the absolute *src*
    path.  Every ``.wav``/``.mp3`` file directly inside *src* is indexed with
    ``extract_idx``: indices 1-50 are copied into ``<name>_cn`` and indices
    51-100 into ``<name>_en``.  In both cases the copy is renamed to a
    six-digit zero-padded 0-49 number, keeping the original suffix.  Files
    without an index, or with an index outside 1-100, are skipped.

    If two source files map to the same target name, the one copied later
    overwrites the earlier copy.

    Args:
        src: Directory containing the numbered audio files.
        dst: Directory under which the two output folders are created.

    Raises:
        FileNotFoundError: If *src* does not exist.
        NotADirectoryError: If *src* exists but is not a directory.
    """
    # abspath() collapses "." and ".." so the output folders are named after
    # a real directory instead of "" or "..".
    src = Path(os.path.abspath(src))
    dst = Path(dst)

    # Fail before creating anything: a mistyped input path used to produce
    # empty output folders and a misleading success message.
    if not src.exists():
        raise FileNotFoundError(f"Input directory does not exist: {src}")
    if not src.is_dir():
        raise NotADirectoryError(f"Input path is not a directory: {src}")

    name = src.name
    cn_dir, en_dir = dst / f"{name}_cn", dst / f"{name}_en"
    cn_dir.mkdir(parents=True, exist_ok=True)
    en_dir.mkdir(parents=True, exist_ok=True)

    audio_suffixes = {'.wav', '.mp3'}
    for f in sorted(src.glob("*.*")):
        if f.suffix.lower() not in audio_suffixes:
            continue
        # The glob also yields directories; copy2 cannot copy those.
        if not f.is_file():
            continue
        idx = extract_idx(f.name)
        if idx is None:
            continue

        # Renumber each half to 0-49 to match GT file_index.
        if 1 <= idx <= 50:
            target = cn_dir / f"{idx-1:06d}{f.suffix}"
        elif 51 <= idx <= 100:
            target = en_dir / f"{idx-51:06d}{f.suffix}"
        else:
            continue
        shutil.copy2(f, target)

    print(f"Split {name} -> {cn_dir.name}, {en_dir.name}")

if __name__ == "__main__":
    # Command-line entry point: two positional paths, then run the split.
    cli = argparse.ArgumentParser()
    for positional in ("input_dir", "output_dir"):
        cli.add_argument(positional)
    opts = cli.parse_args()
    split(opts.input_dir, opts.output_dir)