# representation-chizzler / scripts / publish_commonvoice_dataset.py
# (Hub page metadata: Reza2kn — "Update app for GPU-aware model loading and
# dataset fixes", commit 67ba0d5, verified)
import os
from pathlib import Path
import csv
import re
import sys
from datasets import Audio, Dataset, DatasetDict
from huggingface_hub import HfApi
# Local directory containing the Common Voice TSV files and clips/ tree.
DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA")).resolve()
# Optional comma-separated allow-list of split names; an empty list means
# "use every TSV found on disk".
SPLITS = [
    split.strip()
    for split in os.getenv("COMMONVOICE_SPLITS", "").split(",")
    if split.strip()
]
# Optional explicit "<user>/<name>" target repo; when unset, main() derives
# "<username>/commonvoice-24-fa" from the authenticated account.
REPO_OVERRIDE = os.getenv("COMMONVOICE_REPO")
# Set COMMONVOICE_PRIVATE=1 to create/push the Hub repo as private.
PRIVATE_REPO = os.getenv("COMMONVOICE_PRIVATE", "0") == "1"
# A TSV file is treated as a dataset split only if its header row contains
# both of these column names.
REQUIRED_COLUMNS = {"path", "sentence"}
# Raise the csv module's per-field size cap (capped at 10 million chars) so
# long sentence fields don't raise csv.Error.
csv.field_size_limit(min(sys.maxsize, 10**7))
# Extracts the numeric clip id from names like "common_voice_fa_12345.mp3".
PREFIX_RE = re.compile(r"^common_voice_fa_(\d+)\.mp3$")
# Clips are assumed to be sharded into this many bucket subdirectories under
# clips/ (bucket = clip_id % BUCKET_COUNT) — see bucket_for_clip() in main().
BUCKET_COUNT = int(os.getenv("COMMONVOICE_BUCKETS", "100"))
# Zero-pad width for bucket directory names; at least 2 digits.
BUCKET_WIDTH = max(2, len(str(max(BUCKET_COUNT - 1, 0))))
def load_env(path: Path) -> dict:
    """Parse a minimal ``.env`` file into a dict of key/value strings.

    Lines that are blank, start with ``#``, or contain no ``=`` are skipped.
    Values have surrounding whitespace and a single layer of double then
    single quotes stripped. A missing file yields an empty dict.
    """
    entries: dict = {}
    if not path.exists():
        return entries
    for raw_line in path.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        name, _, rhs = stripped.partition("=")
        entries[name.strip()] = rhs.strip().strip('"').strip("'")
    return entries
def dataset_card(repo_id: str, split_names: list[str]) -> str:
    """Render the README.md dataset card (YAML front matter + markdown)."""
    joined = ", ".join(split_names)
    return f"""---
language:
- fa
license: cc0-1.0
pretty_name: Common Voice 24 (FA) - Audio Column
---
# Common Voice 24 (FA) - Audio Column
This dataset is a repackaging of the Persian subset of Mozilla Common Voice 24.0.
## What changed
- Added an `audio` column pointing to `clips/<bucket>/*.mp3` for easy playback in the Hub UI.
- Only kept `audio` and `sentence` columns (in that order).
## Splits
{joined}
## Notes
Additional TSV files that do not include audio paths (e.g. reports or sentence
metadata) are kept as raw files in the repo but are not exposed as dataset
splits.
## Source
Original data: https://huggingface.co/datasets/mozilla-foundation/common_voice_24_0
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{repo_id}")
```
"""
def main() -> None:
    """Build the FA Common Voice splits from local TSVs and push them to the Hub.

    Reads an HF token from the environment or a local ``.env``, selects the
    TSV files that look like dataset splits, builds a ``DatasetDict`` with
    ``audio``/``sentence`` columns, pushes it, then uploads a README card.

    Raises:
        SystemExit: if no token is found, the dataset dir is missing, or no
            usable split TSVs exist.
    """
    env = load_env(Path(".env"))
    # Prefer a real environment variable, then the common .env key spellings.
    token = (
        os.getenv("HF_TOKEN")
        or env.get("HF_TOKEN")
        or env.get("HUGGINGFACEHUB_API_TOKEN")
        or env.get("HF_API_TOKEN")
    )
    if not token:
        raise SystemExit("HF token not found in .env (HF_TOKEN)")
    if not DATASET_DIR.exists():
        raise SystemExit(f"Dataset dir not found: {DATASET_DIR}")
    tsv_files = sorted(DATASET_DIR.glob("*.tsv"))
    if SPLITS:
        # Restrict to the explicitly requested splits that exist on disk;
        # requested names without a matching TSV are silently dropped.
        tsv_files = [
            DATASET_DIR / f"{name}.tsv"
            for name in SPLITS
            if (DATASET_DIR / f"{name}.tsv").exists()
        ]
    data_files = {}
    for path in tsv_files:
        # Peek at the header only; TSVs lacking "path"/"sentence" columns
        # (e.g. report or sentence-metadata files) are not treated as splits.
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader, [])
        if not REQUIRED_COLUMNS.issubset(header):
            continue
        split_name = path.stem
        data_files[split_name] = str(path)
    if not data_files:
        raise SystemExit(
            f"No split TSV files found under {DATASET_DIR} for {SPLITS}"
        )
    api = HfApi(token=token)
    # Repo defaults to "<username>/commonvoice-24-fa" unless overridden.
    username = api.whoami()["name"]
    repo_id = REPO_OVERRIDE or f"{username}/commonvoice-24-fa"
    api.create_repo(
        repo_id, repo_type="dataset", private=PRIVATE_REPO, exist_ok=True
    )

    def bucket_for_clip(clip_path: str) -> str:
        # Map "common_voice_fa_<id>.mp3" to its zero-padded bucket directory
        # (id % BUCKET_COUNT); anything not matching the pattern goes to
        # "misc". Assumes clips on disk were sharded the same way — TODO
        # confirm against the script that laid out clips/.
        match = PREFIX_RE.match(clip_path)
        if not match:
            return "misc"
        clip_id = int(match.group(1))
        return f"{clip_id % BUCKET_COUNT:0{BUCKET_WIDTH}d}"

    def tsv_generator(path: str):
        # Yield {"audio": relative clip path, "sentence": text} rows from one
        # split TSV, skipping malformed rows (wrong column count) and rows
        # with an empty clip path.
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader, [])
            if not REQUIRED_COLUMNS.issubset(header):
                return
            path_idx = header.index("path")
            sentence_idx = header.index("sentence")
            for row in reader:
                if len(row) != len(header):
                    continue
                clip_path = row[path_idx].strip()
                sentence = row[sentence_idx].strip()
                if not clip_path:
                    continue
                bucket = bucket_for_clip(clip_path)
                yield {
                    "audio": f"clips/{bucket}/{clip_path}",
                    "sentence": sentence,
                }

    dataset_splits = {}
    for split, path in data_files.items():
        dataset_splits[split] = Dataset.from_generator(
            tsv_generator, gen_kwargs={"path": path}
        )
    dataset = DatasetDict(dataset_splits)
    # Cast the string paths to an Audio feature so the Hub renders a player.
    dataset = dataset.cast_column("audio", Audio())
    for split, split_ds in dataset.items():
        dataset[split] = split_ds.select_columns(["audio", "sentence"])
    # NOTE(review): chdir so the relative "clips/<bucket>/..." audio paths
    # resolve against DATASET_DIR during the push — confirm push_to_hub
    # resolves audio files relative to the cwd.
    current_dir = os.getcwd()
    os.chdir(str(DATASET_DIR))
    try:
        dataset.push_to_hub(repo_id, private=PRIVATE_REPO, token=token)
    finally:
        os.chdir(current_dir)
    # Upload the README card last so it reflects the final split list.
    api.upload_file(
        path_or_fileobj=dataset_card(repo_id, sorted(data_files)).encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add dataset card",
    )
    print(f"Dataset published: https://huggingface.co/datasets/{repo_id}")
# Run the publish pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()