Spaces:

Reza2kn
/

representation-chizzler

Running on Zero

App Files Files Community

Reza2kn committed on Jan 5

Commit

67ba0d5

verified ·

1 Parent(s): 2a60ca8

Update app for GPU-aware model loading and dataset fixes

Browse files

Files changed (4) hide show

.gitignore +5 -0
app.py +38 -5
scripts/publish_commonvoice_dataset.py +59 -35
scripts/upload_commonvoice_chunks.py +115 -5

.gitignore CHANGED Viewed

@@ -30,6 +30,11 @@ build/
 chizzler_cache/
 CommonVoice24-FA/
 .commonvoice_upload_checkpoint.json
 *.ogg
 # macOS

 chizzler_cache/
 CommonVoice24-FA/
 .commonvoice_upload_checkpoint.json
+commonvoice_upload.pid
+commonvoice_upload.log
+commonvoice_progress.log
+commonvoice_progress.pid
+.commonvoice_progress_state.json
 *.ogg
 # macOS

app.py CHANGED Viewed

@@ -180,10 +180,10 @@ def select_device() -> torch.device:
     return torch.device("cpu")
-def initialize_models():
     log_progress("Initializing models...")
-    device = select_device()
     log_progress(f"Using {device.type.upper()} for all operations", 2)
     log_progress("Loading Silero VAD model...", 2)
@@ -214,7 +214,29 @@ def initialize_models():
     return vad_model, utils, mpnet_model, config, device
-vad_model, vad_utils, mpnet_model, mpnet_config, device = initialize_models()
 def ensure_mono(waveform: torch.Tensor) -> torch.Tensor:
@@ -283,6 +305,7 @@ def get_speech_timestamps(
 ) -> List[dict]:
     log_progress("Detecting speech segments...", enabled=log)
     (get_speech_timestamps_fn, _, _, _, _) = vad_utils
     speech_timestamps = get_speech_timestamps_fn(
@@ -332,7 +355,10 @@ def extract_speech_waveform(
 def denoise_audio_chunk(
-    audio_tensor: torch.Tensor, chunk_size: int = 5 * DEFAULT_SAMPLE_RATE
 ) -> torch.Tensor:
     chunks = []
     for i in range(0, audio_tensor.size(1), chunk_size):
@@ -375,6 +401,7 @@ def process_waveform(
     max_gap: float = 4.0,
     log: bool = True,
 ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], str, bool]:
     if waveform.device != device:
         waveform = waveform.to(device)
     log_progress("Stage 1: Voice Activity Detection", 2, enabled=log)
@@ -414,7 +441,9 @@ def process_waveform(
     log_progress("Stage 2: MP-SENet denoising", 2, enabled=log)
     with torch.no_grad():
-        denoised_waveform = denoise_audio_chunk(vad_waveform)
     return vad_waveform, denoised_waveform, "\n".join(details), True
@@ -430,6 +459,7 @@ def process_audio_file(
 ) -> Tuple[str, str, str, str]:
     log_progress(f"Processing: {Path(audio_path).name}")
     waveform, sample_rate = load_audio_file(audio_path)
     vad_waveform, denoised_waveform, details, has_speech = process_waveform(
         waveform, sample_rate, threshold=threshold, max_gap=max_gap, log=True
@@ -721,6 +751,9 @@ def process_dataset_and_push(
     if not dataset_id:
         return "Provide a dataset ID or URL."
     config = config.strip() or None
     split = split.strip()
     audio_column = audio_column.strip()

     return torch.device("cpu")
+def initialize_models(device_override: Optional[torch.device] = None):
     log_progress("Initializing models...")
+    device = device_override or select_device()
     log_progress(f"Using {device.type.upper()} for all operations", 2)
     log_progress("Loading Silero VAD model...", 2)
     return vad_model, utils, mpnet_model, config, device
+# Module-level model cache. Every entry starts as None and is filled in by
+# get_models() on first use instead of at import time.
+vad_model = None
+vad_utils = None
+mpnet_model = None
+mpnet_config = None
+# Device the cached models currently live on (None until the first load).
+device = None
+def get_models():
+    """Return the shared models, loading or relocating them on demand.
+
+    The target device is re-evaluated with ``select_device()`` on every call.
+    If any of the cached models/config is missing, everything is (re)loaded
+    through ``initialize_models`` on that device. Otherwise, when the cached
+    device is unset or its string form differs from the target, the VAD and
+    MP-SENet models are moved with ``.to()`` and the cached device is updated.
+
+    Returns:
+        Tuple ``(vad_model, vad_utils, mpnet_model, mpnet_config, device)``.
+    """
+    global vad_model, vad_utils, mpnet_model, mpnet_config, device
+    target_device = select_device()
+    needs_load = (
+        vad_model is None or mpnet_model is None or mpnet_config is None
+    )
+    if needs_load:
+        loaded = initialize_models(target_device)
+        vad_model, vad_utils, mpnet_model, mpnet_config, device = loaded
+    elif device is None or str(device) != str(target_device):
+        # Devices are compared by their string form, as in the original.
+        log_progress(f"Moving models to {target_device}...", 2)
+        vad_model = vad_model.to(target_device)
+        mpnet_model = mpnet_model.to(target_device)
+        device = target_device
+    return vad_model, vad_utils, mpnet_model, mpnet_config, device
 def ensure_mono(waveform: torch.Tensor) -> torch.Tensor:
 ) -> List[dict]:
     log_progress("Detecting speech segments...", enabled=log)
+    vad_model, vad_utils, _, _, _ = get_models()
     (get_speech_timestamps_fn, _, _, _, _) = vad_utils
     speech_timestamps = get_speech_timestamps_fn(
 def denoise_audio_chunk(
+    audio_tensor: torch.Tensor,
+    mpnet_model: torch.nn.Module,
+    mpnet_config: AttrDict,
+    chunk_size: int = 5 * DEFAULT_SAMPLE_RATE,
 ) -> torch.Tensor:
     chunks = []
     for i in range(0, audio_tensor.size(1), chunk_size):
     max_gap: float = 4.0,
     log: bool = True,
 ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], str, bool]:
+    vad_model, vad_utils, mpnet_model, mpnet_config, device = get_models()
     if waveform.device != device:
         waveform = waveform.to(device)
     log_progress("Stage 1: Voice Activity Detection", 2, enabled=log)
     log_progress("Stage 2: MP-SENet denoising", 2, enabled=log)
     with torch.no_grad():
+        denoised_waveform = denoise_audio_chunk(
+            vad_waveform, mpnet_model, mpnet_config
+        )
     return vad_waveform, denoised_waveform, "\n".join(details), True
 ) -> Tuple[str, str, str, str]:
     log_progress(f"Processing: {Path(audio_path).name}")
     waveform, sample_rate = load_audio_file(audio_path)
+    _, _, _, mpnet_config, _ = get_models()
     vad_waveform, denoised_waveform, details, has_speech = process_waveform(
         waveform, sample_rate, threshold=threshold, max_gap=max_gap, log=True
     if not dataset_id:
         return "Provide a dataset ID or URL."
+    # Ensure models are loaded on the correct device before heavy processing.
+    get_models()
     config = config.strip() or None
     split = split.strip()
     audio_column = audio_column.strip()

scripts/publish_commonvoice_dataset.py CHANGED Viewed

@@ -2,6 +2,8 @@ import os
 from pathlib import Path
 import csv
 from datasets import Audio, Dataset, DatasetDict
 from huggingface_hub import HfApi
@@ -10,21 +12,17 @@ from huggingface_hub import HfApi
 DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA")).resolve()
 SPLITS = [
     split.strip()
-    for split in os.getenv("COMMONVOICE_SPLITS", "train,dev,test").split(",")
     if split.strip()
 ]
 REPO_OVERRIDE = os.getenv("COMMONVOICE_REPO")
 PRIVATE_REPO = os.getenv("COMMONVOICE_PRIVATE", "0") == "1"
-DROP_COLUMNS = {
-    "client_id",
-    "sentence_id",
-    "sentence_domain",
-    "accents",
-    "variant",
-    "segment",
-    "path",
-}
 def load_env(path: Path) -> dict:
@@ -40,7 +38,8 @@ def load_env(path: Path) -> dict:
     return data
-def dataset_card(repo_id: str) -> str:
     return f"""---
 language:
 - fa
@@ -53,11 +52,16 @@ pretty_name: Common Voice 24 (FA) - Audio Column
 This dataset is a repackaging of the Persian subset of Mozilla Common Voice 24.0.
 ## What changed
-- Added an `audio` column pointing to `clips/*.mp3` for easy playback in the Hub UI.
-- Removed columns: `client_id`, `sentence_id`, `sentence_domain`, `accents`,
-  `variant`, `segment`, and `path`.
-- Kept columns like `sentence`, `up_votes`, `down_votes`, `age`, `gender`, and
-  `locale`.
 ## Source
 Original data: https://huggingface.co/datasets/mozilla-foundation/common_voice_24_0
@@ -85,11 +89,23 @@ def main() -> None:
     if not DATASET_DIR.exists():
         raise SystemExit(f"Dataset dir not found: {DATASET_DIR}")
     data_files = {}
-    for split in SPLITS:
-        tsv_path = DATASET_DIR / f"{split}.tsv"
-        if tsv_path.exists():
-            data_files[split] = str(tsv_path)
     if not data_files:
         raise SystemExit(
@@ -104,14 +120,33 @@ def main() -> None:
         repo_id, repo_type="dataset", private=PRIVATE_REPO, exist_ok=True
     )
     def tsv_generator(path: str):
         with open(path, "r", encoding="utf-8", errors="replace") as handle:
             reader = csv.reader(handle, delimiter="\t")
-            header = next(reader)
             for row in reader:
                 if len(row) != len(header):
                     continue
-                yield dict(zip(header, row))
     dataset_splits = {}
     for split, path in data_files.items():
@@ -121,20 +156,9 @@ def main() -> None:
     dataset = DatasetDict(dataset_splits)
-    def add_audio(batch):
-        return {
-            "audio": [f"clips/{path}" for path in batch["path"]]
-        }
-    dataset = dataset.map(add_audio, batched=True)
     dataset = dataset.cast_column("audio", Audio())
     for split, split_ds in dataset.items():
-        columns_to_drop = [
-            col for col in split_ds.column_names if col in DROP_COLUMNS
-        ]
-        if columns_to_drop:
-            dataset[split] = split_ds.remove_columns(columns_to_drop)
     current_dir = os.getcwd()
     os.chdir(str(DATASET_DIR))
@@ -144,7 +168,7 @@ def main() -> None:
         os.chdir(current_dir)
     api.upload_file(
-        path_or_fileobj=dataset_card(repo_id).encode("utf-8"),
         path_in_repo="README.md",
         repo_id=repo_id,
         repo_type="dataset",

 from pathlib import Path
 import csv
+import re
+import sys
 from datasets import Audio, Dataset, DatasetDict
 from huggingface_hub import HfApi
 DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA")).resolve()
 SPLITS = [
     split.strip()
+    for split in os.getenv("COMMONVOICE_SPLITS", "").split(",")
     if split.strip()
 ]
 REPO_OVERRIDE = os.getenv("COMMONVOICE_REPO")
 PRIVATE_REPO = os.getenv("COMMONVOICE_PRIVATE", "0") == "1"
+# Header columns a TSV must contain to be exposed as a dataset split; files
+# lacking either one are skipped.
+REQUIRED_COLUMNS = {"path", "sentence"}
+# Raise the csv module's per-field size limit to 10**7 characters (capped at
+# sys.maxsize) so unusually long fields do not abort parsing.
+csv.field_size_limit(min(sys.maxsize, 10**7))
+# Matches clip file names and captures their numeric id.
+PREFIX_RE = re.compile(r"^common_voice_fa_(\d+)\.mp3$")
+# Number of clip sub-folders ("buckets"); overridable via COMMONVOICE_BUCKETS.
+BUCKET_COUNT = int(os.getenv("COMMONVOICE_BUCKETS", "100"))
+# Zero-padded width of a bucket name: at least two digits, and wide enough
+# for the largest bucket index (BUCKET_COUNT - 1).
+BUCKET_WIDTH = max(2, len(str(max(BUCKET_COUNT - 1, 0))))
 def load_env(path: Path) -> dict:
     return data
+def dataset_card(repo_id: str, split_names: list[str]) -> str:
+    splits = ", ".join(split_names)
     return f"""---
 language:
 - fa
 This dataset is a repackaging of the Persian subset of Mozilla Common Voice 24.0.
 ## What changed
+- Added an `audio` column pointing to `clips/<bucket>/*.mp3` for easy playback in the Hub UI.
+- Only kept `audio` and `sentence` columns (in that order).
+## Splits
+{splits}
+## Notes
+Additional TSV files that do not include audio paths (e.g. reports or sentence
+metadata) are kept as raw files in the repo but are not exposed as dataset
+splits.
 ## Source
 Original data: https://huggingface.co/datasets/mozilla-foundation/common_voice_24_0
     if not DATASET_DIR.exists():
         raise SystemExit(f"Dataset dir not found: {DATASET_DIR}")
+    tsv_files = sorted(DATASET_DIR.glob("*.tsv"))
+    if SPLITS:
+        tsv_files = [
+            DATASET_DIR / f"{name}.tsv"
+            for name in SPLITS
+            if (DATASET_DIR / f"{name}.tsv").exists()
+        ]
     data_files = {}
+    for path in tsv_files:
+        with path.open("r", encoding="utf-8", errors="replace") as handle:
+            reader = csv.reader(handle, delimiter="\t")
+            header = next(reader, [])
+        if not REQUIRED_COLUMNS.issubset(header):
+            continue
+        split_name = path.stem
+        data_files[split_name] = str(path)
     if not data_files:
         raise SystemExit(
         repo_id, repo_type="dataset", private=PRIVATE_REPO, exist_ok=True
     )
+    def bucket_for_clip(clip_path: str) -> str:
+        """Return the bucket folder name for a clip file name.
+
+        Names matching ``PREFIX_RE`` map to their numeric id modulo
+        ``BUCKET_COUNT``, zero-padded to ``BUCKET_WIDTH`` digits; any other
+        name falls into the ``"misc"`` bucket.
+        """
+        parsed = PREFIX_RE.match(clip_path)
+        if parsed is None:
+            return "misc"
+        bucket_index = int(parsed.group(1)) % BUCKET_COUNT
+        return f"{bucket_index:0{BUCKET_WIDTH}d}"
     def tsv_generator(path: str):
+        """Yield ``{"audio", "sentence"}`` records from one TSV file.
+
+        Yields nothing when the header lacks the required columns. Rows whose
+        field count differs from the header, or whose clip path is empty
+        after stripping, are skipped. The audio value is the bucketed
+        relative path ``clips/<bucket>/<clip file name>``.
+        """
         with open(path, "r", encoding="utf-8", errors="replace") as handle:
+            # NOTE(review): the reader uses the csv default quoting rules;
+            # confirm the TSVs never contain bare double quotes in sentences.
+            rows = csv.reader(handle, delimiter="\t")
+            columns = next(rows, [])
+            if not REQUIRED_COLUMNS.issubset(columns):
+                return
+            expected_width = len(columns)
+            path_col = columns.index("path")
+            sentence_col = columns.index("sentence")
+            for fields in rows:
+                if len(fields) != expected_width:
+                    continue
+                clip_name = fields[path_col].strip()
+                if not clip_name:
+                    continue
+                bucket = bucket_for_clip(clip_name)
+                yield {
+                    "audio": f"clips/{bucket}/{clip_name}",
+                    "sentence": fields[sentence_col].strip(),
+                }
     dataset_splits = {}
     for split, path in data_files.items():
     dataset = DatasetDict(dataset_splits)
     dataset = dataset.cast_column("audio", Audio())
     for split, split_ds in dataset.items():
+        dataset[split] = split_ds.select_columns(["audio", "sentence"])
     current_dir = os.getcwd()
     os.chdir(str(DATASET_DIR))
         os.chdir(current_dir)
     api.upload_file(
+        path_or_fileobj=dataset_card(repo_id, sorted(data_files)).encode("utf-8"),
         path_in_repo="README.md",
         repo_id=repo_id,
         repo_type="dataset",

scripts/upload_commonvoice_chunks.py CHANGED Viewed

@@ -1,9 +1,15 @@
 import json
 import os
 import re
 from pathlib import Path
-from huggingface_hub import CommitOperationAdd, HfApi
 DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA"))
@@ -14,6 +20,12 @@ REPO_OVERRIDE = os.getenv("COMMONVOICE_REPO")
 PREFIX_RE = re.compile(r"^common_voice_fa_(\d+)\.mp3$")
 CHUNK_SIZE = int(os.getenv("COMMONVOICE_CHUNK_SIZE", "2000"))
 MAX_CHUNKS = int(os.getenv("COMMONVOICE_MAX_CHUNKS", "0"))
 def load_env(path: Path) -> dict:
@@ -33,8 +45,20 @@ def load_env(path: Path) -> dict:
 def load_checkpoint(path: Path) -> dict:
     if not path.exists():
-        return {"metadata_uploaded": False, "prefixes": []}
-    return json.loads(path.read_text())
 def save_checkpoint(path: Path, data: dict) -> None:
@@ -52,6 +76,82 @@ def get_clip_files(clip_dir: Path) -> list[Path]:
     return sorted(files)
 def main() -> None:
     env = load_env(Path(".env"))
     token = (
@@ -73,6 +173,13 @@ def main() -> None:
     api.create_repo(repo_id, repo_type="dataset", exist_ok=True)
     checkpoint = load_checkpoint(CHECKPOINT_FILE)
     if not checkpoint.get("metadata_uploaded"):
         api.upload_folder(
@@ -88,6 +195,8 @@ def main() -> None:
         checkpoint["metadata_uploaded"] = True
         save_checkpoint(CHECKPOINT_FILE, checkpoint)
     clip_dir = DATASET_DIR / "clips"
     clip_files = get_clip_files(clip_dir)
     total = len(clip_files)
@@ -101,12 +210,13 @@ def main() -> None:
         batch = clip_files[start:end]
         operations = [
             CommitOperationAdd(
-                path_in_repo=f"clips/{path.name}",
                 path_or_fileobj=str(path),
             )
             for path in batch
         ]
-        api.create_commit(
             repo_id=repo_id,
             repo_type="dataset",
             operations=operations,

 import json
 import os
 import re
+import time
 from pathlib import Path
+from huggingface_hub import (
+    CommitOperationAdd,
+    CommitOperationCopy,
+    CommitOperationDelete,
+    HfApi,
+)
 DATASET_DIR = Path(os.getenv("COMMONVOICE_DIR", "CommonVoice24-FA"))
 PREFIX_RE = re.compile(r"^common_voice_fa_(\d+)\.mp3$")
 CHUNK_SIZE = int(os.getenv("COMMONVOICE_CHUNK_SIZE", "2000"))
 MAX_CHUNKS = int(os.getenv("COMMONVOICE_MAX_CHUNKS", "0"))
+# Number of clip sub-folders ("buckets"); overridable via COMMONVOICE_BUCKETS.
+BUCKET_COUNT = int(os.getenv("COMMONVOICE_BUCKETS", "100"))
+# Zero-padded width of a bucket name: at least two digits, and wide enough
+# for the largest bucket index (BUCKET_COUNT - 1).
+BUCKET_WIDTH = max(2, len(str(max(BUCKET_COUNT - 1, 0))))
+# How many root-level clips are moved per commit during migration.
+MOVE_BATCH_SIZE = int(os.getenv("COMMONVOICE_MOVE_BATCH", "100"))
+# Migration of already-uploaded root-level clips is on unless
+# COMMONVOICE_MIGRATE is set to something other than "1".
+MIGRATE_EXISTING = os.getenv("COMMONVOICE_MIGRATE", "1") == "1"
+# Total commit attempts and the pause (in seconds) between attempts.
+COMMIT_RETRIES = int(os.getenv("COMMONVOICE_COMMIT_RETRIES", "3"))
+COMMIT_SLEEP = float(os.getenv("COMMONVOICE_COMMIT_SLEEP", "5"))
 def load_env(path: Path) -> dict:
 def load_checkpoint(path: Path) -> dict:
+    """Load the upload checkpoint, filling in any missing keys.
+
+    Args:
+        path: Location of the JSON checkpoint file.
+
+    Returns:
+        The default checkpoint when ``path`` does not exist; otherwise the
+        parsed JSON with every absent default key added.
+    """
+    defaults = {
+        "metadata_uploaded": False,
+        "prefixes": [],
+        "clip_index": 0,
+        "bucketed": False,
+        "bucket_count": BUCKET_COUNT,
+    }
+    if not path.exists():
+        return defaults
+    checkpoint = json.loads(path.read_text())
+    # Older checkpoint files may predate some keys; backfill them.
+    for key, fallback in defaults.items():
+        checkpoint.setdefault(key, fallback)
+    return checkpoint
 def save_checkpoint(path: Path, data: dict) -> None:
     return sorted(files)
+def bucket_for_filename(filename: str) -> str:
+    """Return the bucket folder name for a clip file name.
+
+    Names matching ``PREFIX_RE`` map to their numeric id modulo
+    ``BUCKET_COUNT``, zero-padded to ``BUCKET_WIDTH`` digits; any other name
+    falls into the ``"misc"`` bucket.
+    """
+    parsed = PREFIX_RE.match(filename)
+    if parsed is None:
+        return "misc"
+    bucket_index = int(parsed.group(1)) % BUCKET_COUNT
+    return f"{bucket_index:0{BUCKET_WIDTH}d}"
+def bucketed_repo_path(filename: str) -> str:
+    """Return the in-repo path ``clips/<bucket>/<filename>`` for a clip."""
+    return f"clips/{bucket_for_filename(filename)}/{filename}"
+def create_commit_with_retry(api: HfApi, **kwargs) -> None:
+    """Call ``api.create_commit(**kwargs)``, retrying when it raises.
+
+    Args:
+        api: Hub client used to create the commit.
+        **kwargs: Forwarded unchanged to ``api.create_commit``.
+
+    Raises:
+        Exception: Whatever the final attempt raised, once all attempts are
+            used up.
+    """
+    # Always make at least one attempt. With COMMONVOICE_COMMIT_RETRIES <= 0
+    # the loop body previously never ran, so the commit was silently skipped
+    # and callers carried on as if the upload had succeeded.
+    max_attempts = max(1, COMMIT_RETRIES)
+    for attempt in range(1, max_attempts + 1):
+        try:
+            api.create_commit(**kwargs)
+            return
+        except Exception as exc:
+            # Broad on purpose: any failure of the commit call is retried,
+            # and the last one is re-raised rather than swallowed.
+            if attempt >= max_attempts:
+                raise
+            print(
+                "Commit failed, retrying "
+                f"({attempt}/{max_attempts}): {exc}"
+            )
+            time.sleep(COMMIT_SLEEP)
+def migrate_root_clips(
+    api: HfApi, repo_id: str, checkpoint: dict
+) -> None:
+    """Move clips stored directly under ``clips/`` into bucket sub-folders.
+
+    Does nothing when the checkpoint is already marked ``bucketed`` or when
+    migration is disabled through ``MIGRATE_EXISTING``. Otherwise every repo
+    file of the form ``clips/<name>`` whose name matches ``PREFIX_RE`` is
+    copied to its bucketed path and deleted from the old one, in commits of
+    ``MOVE_BATCH_SIZE`` files. The checkpoint is then marked as bucketed and
+    saved.
+
+    Args:
+        api: Hub client used to list files and create commits.
+        repo_id: Dataset repository to migrate.
+        checkpoint: Mutable checkpoint dict; updated in place.
+    """
+    if checkpoint.get("bucketed") or not MIGRATE_EXISTING:
+        return
+    flat_clips = []
+    for repo_path in api.list_repo_files(repo_id, repo_type="dataset"):
+        if not repo_path.startswith("clips/"):
+            continue
+        # Exactly one slash means the file sits directly under clips/.
+        if repo_path.count("/") != 1:
+            continue
+        if PREFIX_RE.match(Path(repo_path).name):
+            flat_clips.append(repo_path)
+    for offset in range(0, len(flat_clips), MOVE_BATCH_SIZE):
+        operations = []
+        for old_path in flat_clips[offset:offset + MOVE_BATCH_SIZE]:
+            target_path = bucketed_repo_path(Path(old_path).name)
+            # A move is expressed as copy-to-new-path plus delete-old-path.
+            operations.append(
+                CommitOperationCopy(
+                    src_path_in_repo=old_path,
+                    path_in_repo=target_path,
+                )
+            )
+            operations.append(CommitOperationDelete(path_in_repo=old_path))
+        create_commit_with_retry(
+            api,
+            repo_id=repo_id,
+            repo_type="dataset",
+            operations=operations,
+            commit_message=(
+                "Move Common Voice clips into bucketed subfolders"
+            ),
+        )
+    checkpoint["bucketed"] = True
+    if flat_clips:
+        # The bucket count is only recorded when files were actually moved.
+        checkpoint["bucket_count"] = BUCKET_COUNT
+    save_checkpoint(CHECKPOINT_FILE, checkpoint)
 def main() -> None:
     env = load_env(Path(".env"))
     token = (
     api.create_repo(repo_id, repo_type="dataset", exist_ok=True)
     checkpoint = load_checkpoint(CHECKPOINT_FILE)
+    if int(checkpoint.get("bucket_count", BUCKET_COUNT)) != BUCKET_COUNT:
+        raise SystemExit(
+            "Bucket count mismatch. "
+            f"Checkpoint has {checkpoint.get('bucket_count')}, "
+            f"env has {BUCKET_COUNT}. "
+            "Set COMMONVOICE_BUCKETS to match the existing upload."
+        )
     if not checkpoint.get("metadata_uploaded"):
         api.upload_folder(
         checkpoint["metadata_uploaded"] = True
         save_checkpoint(CHECKPOINT_FILE, checkpoint)
+    migrate_root_clips(api, repo_id, checkpoint)
     clip_dir = DATASET_DIR / "clips"
     clip_files = get_clip_files(clip_dir)
     total = len(clip_files)
         batch = clip_files[start:end]
         operations = [
             CommitOperationAdd(
+                path_in_repo=bucketed_repo_path(path.name),
                 path_or_fileobj=str(path),
             )
             for path in batch
         ]
+        create_commit_with_retry(
+            api,
             repo_id=repo_id,
             repo_type="dataset",
             operations=operations,