# Hugging Face Space script (ZeroGPU): repackage the Common Voice 24 FA dump
# with an `audio` column and push it to the Hub as a dataset repo.
import csv
import os
import re
import sys
from pathlib import Path

from datasets import Audio, Dataset, DatasetDict
from huggingface_hub import HfApi
# Local directory holding the extracted Common Voice 24 FA dump.
DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA")).resolve()
# Optional comma-separated list of split names (e.g. "train,dev,test");
# empty means every *.tsv with the required columns becomes a split.
SPLITS = [
    split.strip()
    for split in os.getenv("COMMONVOICE_SPLITS", "").split(",")
    if split.strip()
]
# Explicit target repo id ("user/name"); when unset, main() falls back to
# <hub-username>/commonvoice-24-fa.
REPO_OVERRIDE = os.getenv("COMMONVOICE_REPO")
PRIVATE_REPO = os.getenv("COMMONVOICE_PRIVATE", "0") == "1"
# Header columns a TSV must contain to be exposed as an audio split.
REQUIRED_COLUMNS = {"path", "sentence"}
# Raise csv's per-field size limit so long sentence fields don't abort parsing.
csv.field_size_limit(min(sys.maxsize, 10**7))
# Clip filenames look like common_voice_fa_<id>.mp3; <id> selects the bucket.
PREFIX_RE = re.compile(r"^common_voice_fa_(\d+)\.mp3$")
# Number of clips/ subdirectories, and the zero-padded width of their names.
BUCKET_COUNT = int(os.getenv("COMMONVOICE_BUCKETS", "100"))
BUCKET_WIDTH = max(2, len(str(max(BUCKET_COUNT - 1, 0))))
def load_env(path: Path) -> dict:
    """Parse a simple KEY=VALUE ``.env`` file into a dict.

    Blank lines, ``#`` comment lines, and lines without an ``=`` are
    ignored.  Keys and values are whitespace-stripped, and one layer of
    surrounding single or double quotes is removed from values.  Returns
    an empty dict when *path* does not exist.
    """
    if not path.exists():
        return {}
    entries: dict = {}
    for raw_line in path.read_text().splitlines():
        stripped = raw_line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        if "=" not in stripped:
            continue
        key, _, value = stripped.partition("=")
        entries[key.strip()] = value.strip().strip('"').strip("'")
    return entries
def dataset_card(repo_id: str, split_names: list[str]) -> str:
    """Render the README.md (dataset card) for the published repo.

    The card carries the Hub YAML front matter (language, license, pretty
    name), a short description of the repackaging, the list of exposed
    splits, and a ``load_dataset`` usage snippet pointing at *repo_id*.
    """
    split_list = ", ".join(split_names)
    return f"""---
language:
- fa
license: cc0-1.0
pretty_name: Common Voice 24 (FA) - Audio Column
---
# Common Voice 24 (FA) - Audio Column
This dataset is a repackaging of the Persian subset of Mozilla Common Voice 24.0.
## What changed
- Added an `audio` column pointing to `clips/<bucket>/*.mp3` for easy playback in the Hub UI.
- Only kept `audio` and `sentence` columns (in that order).
## Splits
{split_list}
## Notes
Additional TSV files that do not include audio paths (e.g. reports or sentence
metadata) are kept as raw files in the repo but are not exposed as dataset
splits.
## Source
Original data: https://huggingface.co/datasets/mozilla-foundation/common_voice_24_0
## Usage
```python
from datasets import load_dataset
ds = load_dataset("{repo_id}")
```
"""
def main() -> None:
    """Package local Common Voice FA TSV splits and push them to the Hub.

    Reads the HF token from the process environment or a local ``.env``,
    discovers split TSVs under DATASET_DIR, builds one dataset per TSV
    that has the required columns, and pushes the result plus a generated
    README to a Hugging Face dataset repo.

    Raises:
        SystemExit: when no token is found, the dataset directory is
            missing, or no usable split TSVs are discovered.
    """
    env = load_env(Path(".env"))
    # Prefer the real environment, then the common .env key spellings.
    token = (
        os.getenv("HF_TOKEN")
        or env.get("HF_TOKEN")
        or env.get("HUGGINGFACEHUB_API_TOKEN")
        or env.get("HF_API_TOKEN")
    )
    if not token:
        raise SystemExit("HF token not found in .env (HF_TOKEN)")
    if not DATASET_DIR.exists():
        raise SystemExit(f"Dataset dir not found: {DATASET_DIR}")
    tsv_files = sorted(DATASET_DIR.glob("*.tsv"))
    if SPLITS:
        # Restrict to the explicitly requested splits that actually exist.
        tsv_files = [
            DATASET_DIR / f"{name}.tsv"
            for name in SPLITS
            if (DATASET_DIR / f"{name}.tsv").exists()
        ]
    # Map split name -> TSV path, keeping only TSVs whose header has both
    # required columns (reports / sentence metadata are thereby skipped).
    data_files = {}
    for path in tsv_files:
        with path.open("r", encoding="utf-8", errors="replace") as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader, [])
            if not REQUIRED_COLUMNS.issubset(header):
                continue
        split_name = path.stem
        data_files[split_name] = str(path)
    if not data_files:
        raise SystemExit(
            f"No split TSV files found under {DATASET_DIR} for {SPLITS}"
        )
    api = HfApi(token=token)
    username = api.whoami()["name"]
    repo_id = REPO_OVERRIDE or f"{username}/commonvoice-24-fa"
    api.create_repo(
        repo_id, repo_type="dataset", private=PRIVATE_REPO, exist_ok=True
    )

    def bucket_for_clip(clip_path: str) -> str:
        # Map common_voice_fa_<id>.mp3 to a zero-padded bucket directory
        # (<id> mod BUCKET_COUNT); anything non-matching goes to "misc".
        match = PREFIX_RE.match(clip_path)
        if not match:
            return "misc"
        clip_id = int(match.group(1))
        return f"{clip_id % BUCKET_COUNT:0{BUCKET_WIDTH}d}"

    def tsv_generator(path: str):
        # Yield {"audio", "sentence"} dicts from one split TSV, skipping
        # rows with a column-count mismatch or an empty clip path.
        with open(path, "r", encoding="utf-8", errors="replace") as handle:
            reader = csv.reader(handle, delimiter="\t")
            header = next(reader, [])
            if not REQUIRED_COLUMNS.issubset(header):
                return
            path_idx = header.index("path")
            sentence_idx = header.index("sentence")
            for row in reader:
                if len(row) != len(header):
                    continue
                clip_path = row[path_idx].strip()
                sentence = row[sentence_idx].strip()
                if not clip_path:
                    continue
                bucket = bucket_for_clip(clip_path)
                yield {
                    "audio": f"clips/{bucket}/{clip_path}",
                    "sentence": sentence,
                }

    dataset_splits = {}
    for split, path in data_files.items():
        dataset_splits[split] = Dataset.from_generator(
            tsv_generator, gen_kwargs={"path": path}
        )
    dataset = DatasetDict(dataset_splits)
    # Cast the relative path strings to an Audio feature so the Hub UI
    # renders a player for each row.
    dataset = dataset.cast_column("audio", Audio())
    for split, split_ds in dataset.items():
        dataset[split] = split_ds.select_columns(["audio", "sentence"])
    # NOTE(review): chdir into DATASET_DIR so the relative "clips/..."
    # audio paths resolve during the upload — confirm push_to_hub resolves
    # against the CWD; the original CWD is always restored.
    current_dir = os.getcwd()
    os.chdir(str(DATASET_DIR))
    try:
        dataset.push_to_hub(repo_id, private=PRIVATE_REPO, token=token)
    finally:
        os.chdir(current_dir)
    # Upload the generated dataset card last so it reflects the final splits.
    api.upload_file(
        path_or_fileobj=dataset_card(repo_id, sorted(data_files)).encode("utf-8"),
        path_in_repo="README.md",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add dataset card",
    )
    print(f"Dataset published: https://huggingface.co/datasets/{repo_id}")
# Script entry point.
if __name__ == "__main__":
    main()