File size: 3,772 Bytes
import os
import json
import base64
import argparse
from pathlib import Path

# Bootstrap: if the `datasets` package is not installed, install it (plus the
# audio / Hub helper packages the loaders rely on) with pip and retry the
# import, so the script can run in a fresh environment without a setup step.
# NOTE(review): only `datasets` is version-pinned; the other three install at
# whatever their latest release is.
try:
    from datasets import load_dataset
except ImportError:
    import subprocess
    import sys

    subprocess.check_call(
        [
            sys.executable,
            "-m",
            "pip",
            "install",
            "datasets==2.19.1",
            "soundfile",
            "librosa",
            "huggingface_hub",
        ]
    )
    from datasets import load_dataset
def _extract_text(row):
    """Return the transcription text from *row*, or "" if none is present.

    Checks the field names used by the supported datasets, in priority
    order: Common Voice exposes "sentence"; FLEURS exposes
    "transcription" / "raw_transcription"; "text" is a common fallback.
    """
    for key in ("sentence", "transcription", "text", "raw_transcription"):
        if key in row:
            return row[key]
    return ""


def _extract_audio(row):
    """Return ``(samples, sampling_rate)`` from the row's "audio" column.

    Returns ``(None, None)`` when the row has no decodable audio. The
    sampling rate defaults to 16000 when the loader does not report one.
    """
    audio = row.get("audio")
    if audio is not None and "array" in audio:
        return audio["array"], audio.get("sampling_rate", 16000)
    return None, None


def _wav_base64(samples, sampling_rate):
    """Encode raw audio samples as an in-memory WAV and return it as a
    base64 ASCII string, suitable for embedding in JSON."""
    # Deferred import: soundfile is installed lazily by the bootstrap at the
    # top of this file, so it may not be importable at module-load time.
    import soundfile as sf
    from io import BytesIO

    buf = BytesIO()
    sf.write(buf, samples, sampling_rate, format="WAV")
    return base64.b64encode(buf.getvalue()).decode("utf-8")


def _fetch_samples(source, token):
    """Stream up to ``source["limit"]`` audio rows from one dataset.

    Returns a list of ``{"source", "original_text", "audio_base64"}``
    dicts. Failures are logged and yield a (possibly empty) partial list
    rather than raising, so one bad source does not abort the others.
    """
    print(
        f"\n=> Loading {source['name']} ({source['path']} - {source['config']}) limit {source['limit']}...",
        flush=True,
    )
    samples = []
    try:
        # Streaming avoids downloading the entire (very large) dataset.
        ds = load_dataset(
            source["path"],
            source["config"],
            split=source["split"],
            streaming=True,
            token=token,
            trust_remote_code=True,
        )
        for row in ds:
            if len(samples) >= source["limit"]:
                break
            audio, rate = _extract_audio(row)
            if audio is None:
                # Rows without decodable audio do not count toward the limit.
                continue
            samples.append(
                {
                    "source": source["name"],
                    "original_text": _extract_text(row),
                    "audio_base64": _wav_base64(audio, rate),
                }
            )
            if len(samples) % 5 == 0:
                print(
                    f"Downloaded {len(samples)}/{source['limit']} from {source['name']}",
                    flush=True,
                )
        print(f"✅ Success for {source['name']}: {len(samples)} samples.", flush=True)
    except Exception as e:
        # Best-effort boundary: log and fall through with whatever we got.
        print(f"❌ Failed to load {source['name']}: {str(e)}", flush=True)
    return samples


def main():
    """Fetch Wolof speech samples from Common Voice 17.0 and FLEURS and
    save them (base64 WAV + transcription) to ``<output>/hf_samples.json``.

    Command line: ``--output DIR`` (required) — directory for the JSON
    file; created if missing. Reads ``HF_TOKEN`` from the environment for
    gated datasets.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    out_dir = Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Common Voice 17.0 is gated on the Hub; warn early if no token is set.
    token = os.environ.get("HF_TOKEN")
    if not token:
        print(
            "⚠️ Warning: No HF_TOKEN found in environment. Common Voice 17.0 might fail because it is gated.",
            flush=True,
        )

    sources = [
        {"name": "CommonVoice", "path": "mozilla-foundation/common_voice_17_0", "config": "wo", "split": "test", "limit": 25},
        {"name": "FLEURS", "path": "google/fleurs", "config": "wo_sn", "split": "test", "limit": 25},
    ]

    results = []
    for source in sources:
        results.extend(_fetch_samples(source, token))

    out_file = out_dir / "hf_samples.json"
    # Explicit encoding so the output is identical across platforms.
    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(results, f)
    print(
        f"\n🎉 Finished fetching. Saved {len(results)} total samples to {out_file}",
        flush=True,
    )
# Entry point: run the fetcher only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|