"""Export a capped sample of real and fake audio clips as WAV files.

Loads the ``train`` split of a Hugging Face dataset, writes up to
``MAX_PER_CLASS`` clips of each class under ``data/real`` and ``data/fake``,
and prints how many clips of each class were written.
"""

import os

import soundfile as sf
from datasets import load_dataset

DATASET_NAME = "garystafford/deepfake-audio-detection"
MAX_PER_CLASS = 250  # upper bound on exported clips for each class

# Label value -> class name.  The class name doubles as the output
# sub-directory and the filename prefix.
# NOTE(review): the 0=real / 1=fake mapping is taken from the original
# author's inline note -- confirm against the dataset card.
CLASS_NAMES = {0: "real", 1: "fake"}

for class_name in CLASS_NAMES.values():
    os.makedirs(f"data/{class_name}", exist_ok=True)

dataset = load_dataset(DATASET_NAME, split="train")

# Number of clips written so far, per class; also used as the file index.
counts = {class_name: 0 for class_name in CLASS_NAMES.values()}

for item in dataset:
    class_name = CLASS_NAMES.get(item["label"])

    # Ignore unknown labels and classes whose quota is already full.
    if class_name is not None and counts[class_name] < MAX_PER_CLASS:
        audio = item["audio"]
        sf.write(
            f"data/{class_name}/{class_name}_{counts[class_name]:04d}.wav",
            audio["array"],
            audio["sampling_rate"],
        )
        counts[class_name] += 1

    # Stop scanning as soon as every class has reached its quota.
    if all(count >= MAX_PER_CLASS for count in counts.values()):
        break

print(f"Downloaded: {counts['real']} real, {counts['fake']} fake")