hidude562 committed on
Commit
fb18897
·
verified ·
1 Parent(s): ff02e9f

Upload extract_features.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. extract_features.py +161 -0
extract_features.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Extract F0 (pyin) + RMS from WAVs, tokenize text, compute durations."""
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import numpy as np
7
+ import torch
8
+ import librosa
9
+
10
+
11
# Character vocabulary layout:
#   0 = PAD, 1 = UNK, 2 = space, 3-28 = a-z, 29-38 = 0-9, 39+ = punctuation
PUNCT = ".,;:!?'-\"()/"

# Fixed entries first so the dict keeps the same insertion order as the ID order.
VOCAB = {
    chr(0): 0,   # PAD
    '<UNK>': 1,
    ' ': 2,
}
for char_id, char in enumerate('abcdefghijklmnopqrstuvwxyz', start=3):
    VOCAB[char] = char_id
for char_id, char in enumerate('0123456789', start=29):
    VOCAB[char] = char_id
for char_id, char in enumerate(PUNCT, start=39):
    VOCAB[char] = char_id

# IDs are dense from 0, so the size is the largest ID plus one.
VOCAB_SIZE = max(VOCAB.values()) + 1
23
+
24
+
25
def tokenize(text):
    """Convert text into a list of character IDs.

    The text is lowercased, then each character is looked up in VOCAB;
    characters that are not in VOCAB map to the '<UNK>' ID.
    """
    unk_id = VOCAB['<UNK>']
    lowered = text.lower()
    return [VOCAB.get(ch, unk_id) for ch in lowered]
29
+
30
+
31
def proportional_durations(n_chars, n_frames):
    """Distribute n_frames as evenly as possible over n_chars characters.

    Each character receives n_frames // n_chars frames, and the first
    n_frames % n_chars characters receive one extra frame, so for a positive
    n_chars the result sums to n_frames.

    Returns:
        A list of n_chars integer durations, or [] when n_chars is 0.
    """
    if n_chars == 0:
        return []
    per_char, extra = divmod(n_frames, n_chars)
    # Leading `extra` characters absorb the remainder, one frame each.
    return [per_char + 1] * extra + [per_char] * (n_chars - extra)
39
+
40
+
41
def extract_one(wav_path, text, sr=24000, hop_length=2400):
    """Compute per-frame pitch/energy features and character targets for one sample.

    Args:
        wav_path: Path of the audio file; loaded as mono and resampled to `sr`.
        text: Transcript of the sample; tokenized character by character.
        sr: Sample rate used for loading and analysis.
        hop_length: Hop size in samples for both the F0 and RMS tracks.

    Returns:
        A dict with:
          'char_ids'    int64 array of character IDs,
          'durations'   int64 array of frames assigned to each character,
          'log_f0'      float32 array, log F0 on voiced frames and 0 elsewhere,
          'log_rms'     float32 array, log(RMS + 1e-8),
          'voiced_mask' bool array marking voiced frames,
          'n_frames'    number of frames kept,
          'text'        the transcript as passed in.
        The feature arrays are not normalized here.
    """
    audio, _ = librosa.load(wav_path, sr=sr, mono=True)

    # Pitch track via pyin, searching 50-600 Hz.
    f0, voiced_flag, _ = librosa.pyin(
        audio, fmin=50, fmax=600, sr=sr, hop_length=hop_length
    )
    # Estimates above 300 Hz are treated as octave jumps: those frames are
    # flagged unvoiced and the value itself is clipped into [50, 300].
    above_cap = np.logical_and(~np.isnan(f0), f0 > 300)
    voiced_flag[above_cap] = False
    f0 = np.where(np.isnan(f0), 0.0, np.clip(f0, 50, 300))

    # Frame energy, using a window as long as the hop.
    energy = librosa.feature.rms(
        y=audio, hop_length=hop_length, frame_length=hop_length
    )[0]

    # Truncate every track to the shorter of the two frame counts.
    n_frames = min(len(f0), len(energy))
    f0, voiced_flag, energy = f0[:n_frames], voiced_flag[:n_frames], energy[:n_frames]

    # Move to log space; unvoiced frames keep a log-F0 of 0.
    voiced_mask = voiced_flag.astype(bool)
    log_f0 = np.zeros_like(f0)
    log_f0[voiced_mask] = np.log(f0[voiced_mask])
    log_rms = np.log(energy + 1e-8)  # epsilon keeps silent frames finite

    # Character IDs and an even split of the frames across them.
    char_ids = tokenize(text)
    durations = proportional_durations(len(char_ids), n_frames)

    features = {
        'char_ids': np.array(char_ids, dtype=np.int64),
        'durations': np.array(durations, dtype=np.int64),
        'log_f0': log_f0.astype(np.float32),
        'log_rms': log_rms.astype(np.float32),
        'voiced_mask': voiced_mask,
        'n_frames': n_frames,
        'text': text,
    }
    return features
85
+
86
+
87
def _mean_std(chunks):
    """Return (mean, std) over the concatenation of `chunks`, safe for z-scoring.

    Falls back to (0.0, 1.0) when there is no data at all, and replaces a zero
    or NaN std with 1.0 so that dividing by it never yields inf/NaN.
    """
    if not chunks:
        return 0.0, 1.0
    values = np.concatenate(chunks)
    if values.size == 0:
        return 0.0, 1.0
    mean = float(np.mean(values))
    std = float(np.std(values))
    if not std > 0.0:  # constant data (std == 0) or NaN
        std = 1.0
    return mean, std


def main():
    """Command-line entry point: extract, normalize, and save features.

    Arguments (parsed from the command line):
        --audio_dir    Directory containing the audio files (required).
        --transcripts  JSON file mapping file names to transcript text
                       (required). Only keys that start with 'sample_' and
                       end with '.wav' are processed.
        --output       Destination of the saved features (default 'features.pt').

    For every selected key whose audio file exists, features are extracted
    with extract_one. Global mean/std of voiced log-F0 and of log-RMS are then
    computed over all samples and used to z-score the features in place. The
    result is saved with torch.save as a dict with keys 'samples',
    'norm_stats' and 'vocab_size'.

    Raises:
        SystemExit: if no sample could be processed, so there is nothing to
            normalize or save.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--audio_dir', required=True)
    parser.add_argument('--transcripts', required=True)
    parser.add_argument('--output', default='features.pt')
    args = parser.parse_args()

    # Explicit encoding: JSON is UTF-8, and the platform default may not be.
    with open(args.transcripts, encoding='utf-8') as f:
        transcripts = json.load(f)

    # Filter to sample_*.wav keys only
    keys = sorted(
        k for k in transcripts
        if k.startswith('sample_') and k.endswith('.wav')
    )
    print(f"Processing {len(keys)} samples...")

    samples = []
    all_voiced_f0 = []
    all_log_rms = []

    for i, key in enumerate(keys):
        wav_path = os.path.join(args.audio_dir, key)
        if not os.path.exists(wav_path):
            print(f"  SKIP {key}: file not found")
            continue

        feat = extract_one(wav_path, transcripts[key])
        samples.append(feat)

        # Collect for normalization; only voiced frames contribute to F0 stats.
        if feat['voiced_mask'].any():
            all_voiced_f0.append(feat['log_f0'][feat['voiced_mask']])
        all_log_rms.append(feat['log_rms'])

        if (i + 1) % 200 == 0:
            print(f"  {i+1}/{len(keys)} done")

    # Without this guard the statistics below would fail on empty input with
    # an unhelpful numpy error.
    if not samples:
        raise SystemExit(
            f"No samples were processed (audio_dir={args.audio_dir!r}); "
            "nothing to save."
        )

    # Global normalization stats (guarded against empty data and zero std).
    f0_mean, f0_std = _mean_std(all_voiced_f0)
    rms_mean, rms_std = _mean_std(all_log_rms)

    norm_stats = {
        'f0_mean': f0_mean,
        'f0_std': f0_std,
        'rms_mean': rms_mean,
        'rms_std': rms_std,
    }
    print(f"Norm stats: {norm_stats}")

    # Z-score normalize; unvoiced log-F0 entries are left at 0.
    for s in samples:
        voiced = s['voiced_mask']
        s['log_f0'][voiced] = (s['log_f0'][voiced] - norm_stats['f0_mean']) / norm_stats['f0_std']
        s['log_rms'] = (s['log_rms'] - norm_stats['rms_mean']) / norm_stats['rms_std']

    # Print stats
    voiced_ratios = [s['voiced_mask'].mean() for s in samples]
    frame_counts = [s['n_frames'] for s in samples]
    print(f"Samples: {len(samples)}")
    print(f"Voiced ratio: {np.mean(voiced_ratios):.3f} ± {np.std(voiced_ratios):.3f}")
    print(f"Frame counts: {np.mean(frame_counts):.1f} ± {np.std(frame_counts):.1f} "
          f"(min={np.min(frame_counts)}, max={np.max(frame_counts)})")

    # Convert to tensors for saving
    for s in samples:
        s['char_ids'] = torch.from_numpy(s['char_ids'])
        s['durations'] = torch.from_numpy(s['durations'])
        s['log_f0'] = torch.from_numpy(s['log_f0'])
        s['log_rms'] = torch.from_numpy(s['log_rms'])
        s['voiced_mask'] = torch.from_numpy(s['voiced_mask'])

    torch.save({'samples': samples, 'norm_stats': norm_stats, 'vocab_size': VOCAB_SIZE}, args.output)
    print(f"Saved to {args.output}")
158
+
159
+
160
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()