Spaces:

Reza2kn
/

representation-chizzler

Running on Zero

App Files Files Community

Reza2kn committed on Jan 4

Commit

fba9ebe

verified ·

1 Parent(s): 2b6e54a

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +216 -70

.gitignore CHANGED Viewed

@@ -26,6 +26,7 @@ build/
 *.wav
 *.mp3
 *.flac
 *.ogg
 # macOS

 *.wav
 *.mp3
 *.flac
+chizzler_cache/
 *.ogg
 # macOS

app.py CHANGED Viewed

@@ -1,5 +1,8 @@
 import json
 import os
 import subprocess
 import sys
 import tempfile
@@ -13,7 +16,16 @@ import numpy as np
 import soundfile as sf
 import torch
 import torchaudio
-from datasets import Audio, Dataset, DatasetDict, load_dataset
 from dotenv import load_dotenv
 from huggingface_hub import HfApi, hf_hub_download
 from rich.console import Console
@@ -79,6 +91,7 @@ DEFAULT_MP_SENET_DIR = Path(os.getenv("MPSENET_DIR", CURRENT_DIR / "MP-SENet"))
 MPSENET_GIT_REPO = os.getenv(
     "MPSENET_GIT_REPO", "https://github.com/yxlu-0102/MP-SENet.git"
 )
 def ensure_mpsenet_repo() -> Path:
@@ -410,37 +423,75 @@ def process_audio_file(
     return audio_path, vad_path, denoised_path, details
-def prepare_waveform_from_audio(audio_dict: dict) -> Tuple[torch.Tensor, int]:
-    if not audio_dict:
-        raise ValueError("Empty audio entry.")
-    array = audio_dict.get("array")
-    sample_rate = audio_dict.get("sampling_rate", DEFAULT_SAMPLE_RATE)
-    waveform = torch.tensor(array, dtype=torch.float32)
     waveform = ensure_mono(waveform)
     if sample_rate != DEFAULT_SAMPLE_RATE:
         waveform, sample_rate = resample_waveform(
             waveform, sample_rate, DEFAULT_SAMPLE_RATE
         )
     return waveform, sample_rate
 def infer_audio_column(dataset_obj) -> Optional[str]:
     sample_ds = dataset_obj
-    if isinstance(dataset_obj, DatasetDict):
         sample_ds = next(iter(dataset_obj.values()))
-    if isinstance(sample_ds, Dataset):
         for column, feature in sample_ds.features.items():
             if isinstance(feature, Audio):
                 return column
-        if len(sample_ds) > 0:
-            sample = sample_ds[0]
-            for column, value in sample.items():
-                if isinstance(value, dict) and (
-                    "array" in value or "path" in value or "bytes" in value
-                ):
-                    return column
-                if isinstance(value, str) and value.lower().endswith(AUDIO_EXTENSIONS):
-                    return column
     return None
@@ -463,6 +514,8 @@ def process_dataset_and_push(
     vad_threshold: float,
     max_silence_gap: float,
     max_examples: Optional[float],
     progress=gr.Progress(),
 ) -> str:
     token = get_hf_token()
@@ -477,9 +530,11 @@ def process_dataset_and_push(
     split = split.strip()
     audio_column = audio_column.strip()
     output_repo = normalize_dataset_id(output_repo) if output_repo else ""
     log_progress(f"Loading dataset: {dataset_id}")
-    progress(0, desc="Loading dataset...")
     if split and split.lower() != "all":
         dataset_obj = load_dataset(
             dataset_id, name=config, split=split, token=token
@@ -492,6 +547,7 @@ def process_dataset_and_push(
             if isinstance(dataset_obj, Dataset)
             else dataset_obj
         )
     if not audio_column:
         audio_column = infer_audio_column(dataset_dict) or ""
@@ -502,68 +558,150 @@ def process_dataset_and_push(
             )
     processed_splits = {}
     for split_name, split_ds in dataset_dict.items():
-        if audio_column not in split_ds.column_names:
             return f"Audio column '{audio_column}' not found in split '{split_name}'."
-        split_ds = split_ds.cast_column(
-            audio_column, Audio(sampling_rate=DEFAULT_SAMPLE_RATE)
-        )
-        if max_examples and max_examples > 0:
-            limit = min(int(max_examples), len(split_ds))
-            split_ds = split_ds.select(range(limit))
-        total = len(split_ds)
-        update_every = max(1, total // 100) if total else 1
-        def map_fn(example, idx):
-            try:
-                waveform, sample_rate = prepare_waveform_from_audio(
-                    example[audio_column]
                 )
-            except Exception:
-                return {audio_column: example[audio_column]}
-            vad_waveform, denoised_waveform, _, has_speech = process_waveform(
-                waveform,
-                sample_rate,
-                threshold=vad_threshold,
-                max_gap=max_silence_gap,
-                log=False,
-            )
-            output_waveform = (
-                denoised_waveform
-                if has_speech and denoised_waveform is not None
-                else waveform
             )
-            output_np = (
-                output_waveform.squeeze()
-                .detach()
-                .cpu()
-                .numpy()
-                .astype(np.float32)
             )
-            if total and (idx % update_every == 0 or idx == total - 1):
-                progress(
-                    (idx + 1) / total,
-                    desc=f"Processing {split_name}: {idx + 1}/{total}",
-                )
-            return {
-                audio_column: {
-                    "array": output_np,
-                    "sampling_rate": DEFAULT_SAMPLE_RATE,
-                }
-            }
-        processed_split = split_ds.map(
-            map_fn,
-            with_indices=True,
-            desc=f"Chizzling {split_name}",
-            num_proc=1,
-        )
         processed_splits[split_name] = processed_split
     processed_dataset = (
@@ -649,6 +787,12 @@ with gr.Blocks(title="Representation Chizzler") as demo:
         max_examples_input = gr.Number(
             label="Max examples per split (optional)", value=None
         )
         vad_slider_ds = gr.Slider(
             minimum=0.1,
             maximum=0.9,
@@ -678,6 +822,8 @@ with gr.Blocks(title="Representation Chizzler") as demo:
                 vad_slider_ds,
                 gap_slider_ds,
                 max_examples_input,
             ],
             outputs=[status_box],
             concurrency_limit=1,

+import io
 import json
+import math
 import os
+import shutil
 import subprocess
 import sys
 import tempfile
 import soundfile as sf
 import torch
 import torchaudio
+from datasets import (
+    Audio,
+    Dataset,
+    DatasetDict,
+    IterableDataset,
+    IterableDatasetDict,
+    Value,
+    concatenate_datasets,
+    load_dataset,
+)
 from dotenv import load_dotenv
 from huggingface_hub import HfApi, hf_hub_download
 from rich.console import Console
 MPSENET_GIT_REPO = os.getenv(
     "MPSENET_GIT_REPO", "https://github.com/yxlu-0102/MP-SENet.git"
 )
+CACHE_DIR = Path(os.getenv("CHIZZLER_CACHE_DIR", CURRENT_DIR / "chizzler_cache"))
 def ensure_mpsenet_repo() -> Path:
     return audio_path, vad_path, denoised_path, details
+def load_audio_bytes(audio_bytes: bytes, log: bool = False) -> Tuple[torch.Tensor, int]:
+    data, sample_rate = sf.read(
+        io.BytesIO(audio_bytes), always_2d=True, dtype="float32"
+    )
+    waveform = torch.from_numpy(data.T)
     waveform = ensure_mono(waveform)
     if sample_rate != DEFAULT_SAMPLE_RATE:
+        log_progress(
+            f"Resampling from {sample_rate}Hz to {DEFAULT_SAMPLE_RATE}Hz...",
+            2,
+            enabled=log,
+        )
         waveform, sample_rate = resample_waveform(
             waveform, sample_rate, DEFAULT_SAMPLE_RATE
         )
     return waveform, sample_rate
+def prepare_waveform_from_entry(entry, log: bool = False) -> Tuple[torch.Tensor, int]:
+    if entry is None:
+        raise ValueError("Empty audio entry.")
+    if isinstance(entry, dict):
+        if entry.get("array") is not None:
+            sample_rate = entry.get("sampling_rate", DEFAULT_SAMPLE_RATE)
+            waveform = torch.tensor(entry["array"], dtype=torch.float32)
+            waveform = ensure_mono(waveform)
+            if sample_rate != DEFAULT_SAMPLE_RATE:
+                waveform, sample_rate = resample_waveform(
+                    waveform, sample_rate, DEFAULT_SAMPLE_RATE
+                )
+            return waveform, sample_rate
+        if entry.get("path"):
+            return load_audio_file(entry["path"], log=log)
+        if entry.get("bytes"):
+            return load_audio_bytes(entry["bytes"], log=log)
+    if isinstance(entry, str):
+        return load_audio_file(entry, log=log)
+    raise ValueError("Unsupported audio entry format.")
+def get_dataset_cache_dir(dataset_id: str, config: Optional[str]) -> Path:
+    slug = dataset_id.replace("/", "__")
+    if config:
+        slug = f"{slug}__{config}"
+    return CACHE_DIR / slug
 def infer_audio_column(dataset_obj) -> Optional[str]:
     sample_ds = dataset_obj
+    if isinstance(dataset_obj, (DatasetDict, IterableDatasetDict)):
         sample_ds = next(iter(dataset_obj.values()))
+    if hasattr(sample_ds, "features"):
         for column, feature in sample_ds.features.items():
             if isinstance(feature, Audio):
                 return column
+    if isinstance(sample_ds, Dataset) and len(sample_ds) > 0:
+        sample = sample_ds[0]
+        for column, value in sample.items():
+            if isinstance(value, dict) and (
+                "array" in value or "path" in value or "bytes" in value
+            ):
+                return column
+            if isinstance(value, str) and value.lower().endswith(AUDIO_EXTENSIONS):
+                return column
     return None
     vad_threshold: float,
     max_silence_gap: float,
     max_examples: Optional[float],
+    resume_processing: bool,
+    shard_size: Optional[float],
     progress=gr.Progress(),
 ) -> str:
     token = get_hf_token()
     split = split.strip()
     audio_column = audio_column.strip()
     output_repo = normalize_dataset_id(output_repo) if output_repo else ""
+    max_examples_int = int(max_examples) if max_examples and max_examples > 0 else None
+    shard_size_int = int(shard_size) if shard_size and shard_size > 0 else 1000
     log_progress(f"Loading dataset: {dataset_id}")
+    progress(0, desc="Downloading dataset...")
     if split and split.lower() != "all":
         dataset_obj = load_dataset(
             dataset_id, name=config, split=split, token=token
             if isinstance(dataset_obj, Dataset)
             else dataset_obj
         )
+    progress(0.01, desc="Preparing splits...")
     if not audio_column:
         audio_column = infer_audio_column(dataset_dict) or ""
             )
     processed_splits = {}
+    cache_root = get_dataset_cache_dir(dataset_id, config)
+    cache_root.mkdir(parents=True, exist_ok=True)
     for split_name, split_ds in dataset_dict.items():
+        if (
+            hasattr(split_ds, "column_names")
+            and audio_column not in split_ds.column_names
+        ):
             return f"Audio column '{audio_column}' not found in split '{split_name}'."
+        try:
+            split_ds = split_ds.cast_column(audio_column, Audio(decode=False))
+        except Exception:
+            split_ds = split_ds.cast_column(
+                audio_column, Audio(sampling_rate=DEFAULT_SAMPLE_RATE, decode=False)
+            )
+        total = len(split_ds) if isinstance(split_ds, Dataset) else None
+        if max_examples_int and total is not None:
+            total = min(total, max_examples_int)
+        update_every = max(1, (total or max_examples_int or 100) // 100)
+        split_cache_dir = cache_root / split_name
+        if not resume_processing and split_cache_dir.exists():
+            shutil.rmtree(split_cache_dir)
+        split_cache_dir.mkdir(parents=True, exist_ok=True)
+        features = split_ds.features.copy()
+        features[audio_column] = Audio(
+            sampling_rate=DEFAULT_SAMPLE_RATE, decode=False
+        )
+        features["chizzler_ok"] = Value("bool")
+        features["chizzler_error"] = Value("string")
+        def make_map_fn(offset: int = 0):
+            def map_fn(example, idx):
+                entry = example.get(audio_column)
+                ok = True
+                error_message = ""
+                try:
+                    waveform, sample_rate = prepare_waveform_from_entry(
+                        entry, log=False
+                    )
+                    vad_waveform, denoised_waveform, _, has_speech = process_waveform(
+                        waveform,
+                        sample_rate,
+                        threshold=vad_threshold,
+                        max_gap=max_silence_gap,
+                        log=False,
+                    )
+                    output_waveform = (
+                        denoised_waveform
+                        if has_speech and denoised_waveform is not None
+                        else waveform
+                    )
+                    output_np = (
+                        output_waveform.squeeze()
+                        .detach()
+                        .cpu()
+                        .numpy()
+                        .astype(np.float32)
+                    )
+                except Exception as exc:
+                    ok = False
+                    error_message = str(exc)
+                    output_np = np.zeros(1, dtype=np.float32)
+                example[audio_column] = {
+                    "array": output_np,
+                    "sampling_rate": DEFAULT_SAMPLE_RATE,
+                }
+                example["chizzler_ok"] = ok
+                example["chizzler_error"] = error_message
+                global_idx = offset + idx + 1
+                if total:
+                    if global_idx % update_every == 0 or global_idx == total:
+                        progress(
+                            global_idx / total,
+                            desc=(
+                                f"Processing {split_name}: {global_idx}/{total}"
+                            ),
+                        )
+                else:
+                    if global_idx % update_every == 0:
+                        progress(
+                            0,
+                            desc=f"Processing {split_name}: {global_idx} examples",
+                        )
+                return example
+            return map_fn
+        if total:
+            num_shards = math.ceil(total / shard_size_int)
+            shards = []
+            for shard_idx in range(num_shards):
+                start = shard_idx * shard_size_int
+                end = min(total, start + shard_size_int)
+                cache_file = split_cache_dir / (
+                    f"{split_name}-{start:07d}-{end:07d}.arrow"
                 )
+                if resume_processing and cache_file.exists():
+                    processed_shard = Dataset.from_file(str(cache_file))
+                    progress(
+                        end / total,
+                        desc=f"Processing {split_name}: {end}/{total}",
+                    )
+                else:
+                    shard_ds = split_ds.select(list(range(start, end)))
+                    processed_shard = shard_ds.map(
+                        make_map_fn(offset=start),
+                        with_indices=True,
+                        load_from_cache_file=False,
+                        cache_file_name=str(cache_file),
+                        writer_batch_size=50,
+                        num_proc=None,
+                        features=features,
+                        desc=(
+                            f"Chizzling {split_name} "
+                            f"({shard_idx + 1}/{num_shards})"
+                        ),
+                    )
+                shards.append(processed_shard)
+            processed_split = (
+                concatenate_datasets(shards)
+                if len(shards) > 1
+                else shards[0]
             )
+        else:
+            processed_split = split_ds.map(
+                make_map_fn(offset=0),
+                with_indices=True,
+                load_from_cache_file=False,
+                writer_batch_size=50,
+                num_proc=None,
+                features=features,
+                desc=f"Chizzling {split_name}",
             )
         processed_splits[split_name] = processed_split
     processed_dataset = (
         max_examples_input = gr.Number(
             label="Max examples per split (optional)", value=None
         )
+        resume_checkbox = gr.Checkbox(
+            label="Resume from cached shards", value=True
+        )
+        shard_size_input = gr.Number(
+            label="Shard size (examples)", value=1000
+        )
         vad_slider_ds = gr.Slider(
             minimum=0.1,
             maximum=0.9,
                 vad_slider_ds,
                 gap_slider_ds,
                 max_examples_input,
+                resume_checkbox,
+                shard_size_input,
             ],
             outputs=[status_box],
             concurrency_limit=1,