AnodHuang committed on
Commit
297be61
·
verified ·
1 Parent(s): 7356008

Upload 4 files

Browse files
Files changed (4) hide show
  1. config.json +84 -0
  2. model.safetensors +3 -0
  3. preprocessor_config.json +9 -0
  4. verify_hubert.py +214 -0
config.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./hubert",
3
+ "activation_dropout": 0.1,
4
+ "apply_spec_augment": true,
5
+ "architectures": [
6
+ "HubertForSequenceClassification"
7
+ ],
8
+ "attention_dropout": 0.1,
9
+ "bos_token_id": 1,
10
+ "classifier_proj_size": 256,
11
+ "conv_bias": false,
12
+ "conv_dim": [
13
+ 512,
14
+ 512,
15
+ 512,
16
+ 512,
17
+ 512,
18
+ 512,
19
+ 512
20
+ ],
21
+ "conv_kernel": [
22
+ 10,
23
+ 3,
24
+ 3,
25
+ 3,
26
+ 3,
27
+ 2,
28
+ 2
29
+ ],
30
+ "conv_stride": [
31
+ 5,
32
+ 2,
33
+ 2,
34
+ 2,
35
+ 2,
36
+ 2,
37
+ 2
38
+ ],
39
+ "ctc_loss_reduction": "sum",
40
+ "ctc_zero_infinity": false,
41
+ "do_stable_layer_norm": false,
42
+ "eos_token_id": 2,
43
+ "feat_extract_activation": "gelu",
44
+ "feat_extract_dropout": 0.0,
45
+ "feat_extract_norm": "group",
46
+ "feat_proj_dropout": 0.1,
47
+ "feat_proj_layer_norm": true,
48
+ "final_dropout": 0.1,
49
+ "gradient_checkpointing": false,
50
+ "hidden_act": "gelu",
51
+ "hidden_dropout": 0.1,
52
+ "hidden_dropout_prob": 0.1,
53
+ "hidden_size": 768,
54
+ "id2label": {
55
+ "0": "spoof",
56
+ "1": "bonafide"
57
+ },
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 3072,
60
+ "label2id": {
61
+ "bonafide": 1,
62
+ "spoof": 0
63
+ },
64
+ "layer_norm_eps": 1e-05,
65
+ "layerdrop": 0.1,
66
+ "mask_feature_length": 10,
67
+ "mask_feature_min_masks": 0,
68
+ "mask_feature_prob": 0.0,
69
+ "mask_time_length": 10,
70
+ "mask_time_min_masks": 2,
71
+ "mask_time_prob": 0.05,
72
+ "model_type": "hubert",
73
+ "num_attention_heads": 12,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 12,
78
+ "pad_token_id": 0,
79
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
80
+ "torch_dtype": "float32",
81
+ "transformers_version": "4.46.3",
82
+ "use_weighted_layer_sum": false,
83
+ "vocab_size": 32
84
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1258b6f224f71867f70949bc719ce9b25e2fc96d32c265e9ade4b81fd3bcb7dd
3
+ size 378301944
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": false,
8
+ "sampling_rate": 16000
9
+ }
verify_hubert.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import time
4
+ import numpy as np
5
+ import torch
6
+ import soundfile as sf
7
+ from tqdm import tqdm
8
+ import pyarrow.parquet as pq
9
+
10
+ from transformers import AutoFeatureExtractor, HubertForSequenceClassification
11
+
12
# =========================
# 0) User configuration — edit only this section
# =========================
# Local directories (Windows paths) for the parquet dataset and the fine-tuned
# HuBERT checkpoint. NOTE(review): adjust to your own environment.
PARQUET_DIR = r"D:\capstone\asv_spoof\parquet"
MODEL_DIR = r"D:\capstone\models\hubert_snr"

SPLIT = "test"  # which split to evaluate: "train" / "validation" / "test"
BATCH_SIZE = 32  # rows per inference batch
CPU_THREADS = 8  # torch intra-op thread count when running on CPU

# Meaning of the `key` column: key=1 is spoof, key=0 is bonafide
KEY_SPOOF_VALUE = 1

# Expected single-shard parquet naming scheme, e.g. "test-00000-of-00001.parquet"
PARQUET_FILE = os.path.join(PARQUET_DIR, f"{SPLIT}-00000-of-00001.parquet")
# When True, cross-check the `key` label against `system_id` ('-' = bonafide)
CHECK_LABEL_CONSISTENCY = True
27
+
28
+
29
+ # =========================
30
+ # 1) Audio decoding / resampling (in-memory; nothing written to disk)
31
+ # =========================
32
def decode_audio(bytes_blob: bytes | None, path_str: str | None):
    """Decode one audio record into a mono float32 waveform, entirely in memory.

    Prefers the embedded byte payload; falls back to reading the file at
    ``path_str`` when no bytes are present. Multi-channel audio is down-mixed
    to mono by channel averaging.

    Returns:
        Tuple ``(waveform, sample_rate)`` as ``(np.ndarray[float32], int)``.

    Raises:
        RuntimeError: if neither an embedded payload nor a usable path exists.
    """
    if bytes_blob is None:
        # No embedded audio — the on-disk fallback path must exist.
        if not path_str or not os.path.exists(path_str):
            raise RuntimeError("audio.bytes 为空,且 audio.path 不存在/不可用")
        source = path_str
    else:
        source = io.BytesIO(bytes_blob)

    wav, sr = sf.read(source, dtype="float32", always_2d=False)

    # soundfile returns (frames, channels) for multi-channel input: average to mono.
    if isinstance(wav, np.ndarray) and wav.ndim > 1:
        wav = wav.mean(axis=1)
    return wav.astype(np.float32), int(sr)
43
+
44
+
45
def simple_resample(wav: np.ndarray, sr: int, new_sr: int) -> np.ndarray:
    """Crudely resample ``wav`` from ``sr`` Hz to ``new_sr`` Hz via linear interpolation.

    No anti-aliasing filter is applied — adequate for verification scripts,
    not for production resampling. Returns the input unchanged when the rates
    already match or the waveform is empty.
    """
    if sr == new_sr or wav.size == 0:
        return wav
    n_in = wav.shape[0]
    n_out = int(round(n_in * (new_sr / sr)))
    # Both grids span [0, 1) so relative sample positions line up.
    src_pos = np.linspace(0, 1, num=n_in, endpoint=False)
    dst_pos = np.linspace(0, 1, num=n_out, endpoint=False)
    return np.interp(dst_pos, src_pos, wav).astype(np.float32)
54
+
55
+
56
def key_to_label01(k) -> int:
    """Map a raw ``key`` column value to a binary label (1 = spoof, 0 = bonafide)."""
    return int(int(k) == KEY_SPOOF_VALUE)
59
+
60
+
61
def system_id_to_label01(sid: str) -> int:
    """Map an ASVspoof ``system_id`` to a binary label.

    ``'-'`` denotes bonafide audio (label 0); any attack identifier such as
    ``'A07'`` denotes spoofed audio (label 1).
    """
    cleaned = str(sid).strip()
    if cleaned == "-":
        return 0
    return 1
64
+
65
+
66
# =========================
# 2) Device & model (HuBERT)
# =========================
torch.set_num_threads(CPU_THREADS)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))
    # Allow cuDNN to pick the fastest conv kernels for the observed shapes.
    torch.backends.cudnn.benchmark = True

# Mixed-precision autocast only makes sense on CUDA.
use_amp = (device.type == "cuda")

processor = AutoFeatureExtractor.from_pretrained(MODEL_DIR)
model = HubertForSequenceClassification.from_pretrained(MODEL_DIR).to(device).eval()

# Fall back to 16 kHz if the feature extractor does not expose a sampling rate.
target_sr = getattr(processor, "sampling_rate", 16000)


# =========================
# 3) Read the parquet file
# =========================
pf = pq.ParquetFile(PARQUET_FILE)
num_rows = pf.metadata.num_rows
# Ceiling division: number of batches covering all rows.
num_batches = (num_rows + BATCH_SIZE - 1) // BATCH_SIZE

print(f"Parquet: {PARQUET_FILE}")
print(f"Rows: {num_rows}, Batches: {num_batches}, BatchSize: {BATCH_SIZE}")


# =========================
# 4) Inference + metric accumulation
# =========================
correct = 0
total = 0
tp = fp = tn = fn = 0  # positive class = spoof = 1

# Counters for the optional key-vs-system_id label consistency check.
mismatch = 0
checked = 0

t0 = time.time()
with torch.no_grad():
    pbar = tqdm(total=num_batches, desc=f"Predicting [{SPLIT}]", unit="batch")

    # Stream record batches so the whole split never has to fit in memory.
    for rb in pf.iter_batches(batch_size=BATCH_SIZE, columns=["audio", "key", "system_id"]):
        audio_struct = rb.column(rb.schema.get_field_index("audio"))
        key_arr = rb.column(rb.schema.get_field_index("key"))
        sys_arr = rb.column(rb.schema.get_field_index("system_id"))

        # The `audio` struct may carry embedded bytes, a path, or both.
        bytes_arr = audio_struct.field("bytes") if audio_struct.type.get_field_index("bytes") != -1 else None
        path_arr = audio_struct.field("path") if audio_struct.type.get_field_index("path") != -1 else None

        keys = key_arr.to_pylist()
        sysids = sys_arr.to_pylist()
        bytes_list = bytes_arr.to_pylist() if bytes_arr is not None else [None] * len(keys)
        path_list = path_arr.to_pylist() if path_arr is not None else [None] * len(keys)

        waves = []
        labels = []

        for b, p, k, sid in zip(bytes_list, path_list, keys, sysids):
            y = key_to_label01(k)
            labels.append(y)

            # Optionally verify that the `key` label agrees with `system_id`.
            if CHECK_LABEL_CONSISTENCY:
                y2 = system_id_to_label01(sid)
                checked += 1
                if y != y2:
                    mismatch += 1

            wav, sr = decode_audio(b, p)
            wav = simple_resample(wav, sr, target_sr)
            waves.append(wav)

        # Pad the batch to a common length and build model-ready tensors.
        inputs = processor(
            waves,
            sampling_rate=target_sr,
            return_tensors="pt",
            padding=True,
        )
        inputs = {k: v.to(device, non_blocking=True) for k, v in inputs.items()}
        labels_t = torch.tensor(labels, dtype=torch.long, device=device)

        if use_amp:
            with torch.amp.autocast("cuda"):
                logits = model(**inputs).logits
        else:
            logits = model(**inputs).logits

        preds = torch.argmax(logits, dim=-1)

        total += labels_t.numel()
        correct += (preds == labels_t).sum().item()

        # Confusion-matrix cells with spoof (1) as the positive class.
        tp += ((preds == 1) & (labels_t == 1)).sum().item()
        fp += ((preds == 1) & (labels_t == 0)).sum().item()
        tn += ((preds == 0) & (labels_t == 0)).sum().item()
        fn += ((preds == 0) & (labels_t == 1)).sum().item()

        pbar.update(1)

    pbar.close()

elapsed = time.time() - t0


# =========================
# 5) Compute metrics
# =========================
acc = correct / max(total, 1)
eps = 1e-12  # guards the divisions below against empty confusion cells
precision = tp / (tp + fp + eps)
recall = tp / (tp + fn + eps)  # TPR
f1 = 2 * precision * recall / (precision + recall + eps)
fnr = fn / (fn + tp + eps)
fpr = fp / (fp + tn + eps)

print("\n===== Summary =====")
print(f"Split : {SPLIT}")
print(f"Accuracy : {acc:.6f} ({correct}/{total})")
print(f"Confusion : TP={tp}, FP={fp}, TN={tn}, FN={fn}")
print(f"Time : {elapsed:.2f}s, {total / max(elapsed,1e-9):.2f} samples/s")

if CHECK_LABEL_CONSISTENCY:
    print(f"Label check: key vs system_id mismatches = {mismatch}/{checked}")

print("\n===== Metrics (pos=spoof=1) =====")
print(f"Precision : {precision:.6f}")
print(f"FNR : {fnr:.6f}")
print(f"FPR : {fpr:.6f}")
print(f"F1-score : {f1:.6f}")

# Recorded output of a previous run, kept for reference (string literal, no effect).
'''
===== Summary =====
Split : test
Accuracy : 0.975097 (69463/71237)
Confusion : TP=62229, FP=121, TN=7234, FN=1653
Time : 4366.41s, 16.31 samples/s
Label check: key vs system_id mismatches = 0/71237

===== Metrics (pos=spoof=1) =====
Precision : 0.998059
FNR : 0.025876
FPR : 0.016451
F1-score : 0.985947

进程已结束,退出代码为 0

'''