# representation-chizzler / scripts / publish_commonvoice_dataset.py
# (Hub page metadata: Reza2kn — "Update app for GPU-aware model loading and
# dataset fixes", commit 67ba0d5, verified)
import os
from pathlib import Path
import csv
import re
import sys
from datasets import Audio, Dataset, DatasetDict
from huggingface_hub import HfApi
# Local directory containing the Common Voice TSV files and clips/ tree.
DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA")).resolve()
# Optional comma-separated allow-list of split names; an empty list means
# "use every TSV found on disk".
SPLITS = [
    split.strip()
    for split in os.getenv("COMMONVOICE_SPLITS", "").split(",")
    if split.strip()
]
# Optional explicit "<user>/<name>" target repo; when unset, main() derives
# "<username>/commonvoice-24-fa" from the authenticated account.
REPO_OVERRIDE = os.getenv("COMMONVOICE_REPO")
# Set COMMONVOICE_PRIVATE=1 to create/push the Hub repo as private.
PRIVATE_REPO = os.getenv("COMMONVOICE_PRIVATE", "0") == "1"
# A TSV file is treated as a dataset split only if its header row contains
# both of these column names.
REQUIRED_COLUMNS = {"path", "sentence"}
# Raise the csv module's per-field size cap (capped at 10 million chars) so
# long sentence fields don't raise csv.Error.
csv.field_size_limit(min(sys.maxsize, 10**7))
# Extracts the numeric clip id from names like "common_voice_fa_12345.mp3".
PREFIX_RE = re.compile(r"^common_voice_fa_(\d+)\.mp3$")
# Clips are assumed to be sharded into this many bucket subdirectories under
# clips/ (bucket = clip_id % BUCKET_COUNT) — see bucket_for_clip() in main().
BUCKET_COUNT = int(os.getenv("COMMONVOICE_BUCKETS", "100"))
# Zero-pad width for bucket directory names; at least 2 digits.
BUCKET_WIDTH = max(2, len(str(max(BUCKET_COUNT - 1, 0))))
def load_env(path: Path) -> dict:
    """Parse a minimal ``.env`` file into a dict of key/value strings.

    Lines that are blank, start with ``#``, or contain no ``=`` are skipped.
    Values have surrounding whitespace and a single layer of double then
    single quotes stripped. A missing file yields an empty dict.
    """
    entries: dict = {}
    if not path.exists():
        return entries
    for raw_line in path.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#") or "=" not in stripped:
            continue
        name, _, rhs = stripped.partition("=")
        entries[name.strip()] = rhs.strip().strip('"').strip("'")
    return entries
def dataset_card(repo_id: str, split_names: list[str]) -> str:
    """Render the README.md dataset card (YAML front matter + markdown)."""
    joined = ", ".join(split_names)
    return f"""---
language:
- fa
license: cc0-1.0
pretty_name: Common Voice 24 (FA) - Audio Column
---
# Common Voice 24 (FA) - Audio Column
This dataset is a repackaging of the Persian subset of Mozilla Common Voice 24.0.
## What changed
- Added an `audio` column pointing to `clips/<bucket>/*.mp3` for easy playback in the Hub UI.
- Only kept `audio` and `sentence` columns (in that order).
## Splits
{joined}
## Notes
Additional TSV files that do not include audio paths (e.g. reports or sentence
metadata) are kept as raw files in the repo but are not exposed as dataset
splits.
## Source
Original data: https://huggingface.co/datasets/mozilla-foundation/common_voice_24_0
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{repo_id}")
```
"""
def main() -> None:
    """Build the FA Common Voice splits from local TSVs and push them to the Hub.

    Reads an HF token from the environment or a local ``.env``, selects the
    TSV files that look like dataset splits, builds a ``DatasetDict`` with
    ``audio``/``sentence`` columns, pushes it, then uploads a README card.

    Raises:
        SystemExit: if no token is found, the dataset dir is missing, or no
            usable split TSVs exist.
    """
    env = load_env(Path(".env"))
    # Prefer a real environment variable, then the common .env key spellings.
    token = (
        os.getenv("HF_TOKEN")
        or env.get("HF_TOKEN")
        or env.get("HUGGINGFACEHUB_API_TOKEN")
        or env.get("HF_API_TOKEN")
    )
    if not token:
        raise SystemExit("HF token not found in .env (HF_TOKEN)")
    if not DATASET_DIR.exists():
        raise SystemExit(f"Dataset dir not found: {DATASET_DIR}")
    tsv_files = sorted(DATASET_DIR.glob("*.tsv"))
    if SPLITS:
        # Restrict to the explicitly requested splits that exist on disk;
        # requested names without a matching TSV are silently dropped.
        tsv_files = [
            DATASET_DIR / f"{name}.tsv"
            for name in SPLITS
            if (DATASET_DIR / f"{name}.tsv").exists()
        ]
    data_files = {}
    for path in tsv_files:
        # Peek at the header only; TSVs lacking "path"/"sentence" columns
        # (e.g. report or sentence-metadata files) are not treated as splits.
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader, [])
        if not REQUIRED_COLUMNS.issubset(header):
            continue
        split_name = path.stem
        data_files[split_name] = str(path)
    if not data_files:
        raise SystemExit(
            f"No split TSV files found under {DATASET_DIR} for {SPLITS}"
        )
    api = HfApi(token=token)
    # Repo defaults to "<username>/commonvoice-24-fa" unless overridden.
    username = api.whoami()["name"]
    repo_id = REPO_OVERRIDE or f"{username}/commonvoice-24-fa"
    api.create_repo(
        repo_id, repo_type="dataset", private=PRIVATE_REPO, exist_ok=True
    )

    def bucket_for_clip(clip_path: str) -> str:
        # Map "common_voice_fa_<id>.mp3" to its zero-padded bucket directory
        # (id % BUCKET_COUNT); anything not matching the pattern goes to
        # "misc". Assumes clips on disk were sharded the same way — TODO
        # confirm against the script that laid out clips/.
        match = PREFIX_RE.match(clip_path)
        if not match:
            return "misc"
        clip_id = int(match.group(1))
        return f"{clip_id % BUCKET_COUNT:0{BUCKET_WIDTH}d}"

    def tsv_generator(path: str):
        # Yield {"audio": relative clip path, "sentence": text} rows from one
        # split TSV, skipping malformed rows (wrong column count) and rows
        # with an empty clip path.
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader, [])
            if not REQUIRED_COLUMNS.issubset(header):
                return
            path_idx = header.index("path")
            sentence_idx = header.index("sentence")
            for row in reader:
                if len(row) != len(header):
                    continue
                clip_path = row[path_idx].strip()
                sentence = row[sentence_idx].strip()
                if not clip_path:
                    continue
                bucket = bucket_for_clip(clip_path)
                yield {
                    "audio": f"clips/{bucket}/{clip_path}",
                    "sentence": sentence,
                }

    dataset_splits = {}
    for split, path in data_files.items():
        dataset_splits[split] = Dataset.from_generator(
            tsv_generator, gen_kwargs={"path": path}
        )
    dataset = DatasetDict(dataset_splits)
    # Cast the string paths to an Audio feature so the Hub renders a player.
    dataset = dataset.cast_column("audio", Audio())
    for split, split_ds in dataset.items():
        dataset[split] = split_ds.select_columns(["audio", "sentence"])
    # NOTE(review): chdir so the relative "clips/<bucket>/..." audio paths
    # resolve against DATASET_DIR during the push — confirm push_to_hub
    # resolves audio files relative to the cwd.
    current_dir = os.getcwd()
    os.chdir(str(DATASET_DIR))
    try:
        dataset.push_to_hub(repo_id, private=PRIVATE_REPO, token=token)
    finally:
        os.chdir(current_dir)
    # Upload the README card last so it reflects the final split list.
    api.upload_file(
        path_or_fileobj=dataset_card(repo_id, sorted(data_files)).encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add dataset card",
    )
    print(f"Dataset published: https://huggingface.co/datasets/{repo_id}")
# Run the publish pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()