Spaces:

manoskary
/

Woosh-DFlow

Running on Zero

App Files Files Community

manoskary committed on Apr 15

Commit

5be99b2

1 Parent(s): 59f9e42

Add initial project files including .gitignore, README, app.py, and requirements.txt

Browse files

Files changed (4) hide show

.gitignore +7 -0
README.md +37 -5
app.py +481 -0
requirements.txt +11 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+__pycache__/
+*.py[cod]
+checkpoints/
+outputs/
+.gradio/
+.codex

README.md CHANGED Viewed

@@ -1,15 +1,47 @@
 ---
 title: Woosh DFlow
-emoji: 💻
 colorFrom: red
-colorTo: gray
 sdk: gradio
 sdk_version: 6.12.0
-python_version: '3.12'
 app_file: app.py
 pinned: false
 license: mit
-short_description: 'Woosh: Sound Effect Generative Model '
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: Woosh DFlow
+emoji: 🔊
 colorFrom: red
+colorTo: green
 sdk: gradio
 sdk_version: 6.12.0
+python_version: '3.12.12'
 app_file: app.py
 pinned: false
 license: mit
+fullWidth: true
+startup_duration_timeout: 1h
+short_description: 'Woosh-DFlow text-to-audio sound effect generation'
+tags:
+  - audio-generation
+  - text-to-audio
+  - gradio
+  - zerogpu
 ---
+# Woosh-DFlow
+Text-to-audio sound effect generation with Sony AI's distilled Woosh-DFlow model.
+The app downloads the official `Woosh-DFlow.zip` checkpoint from the
+`SonyResearch/Woosh` v1.0.0 GitHub release when the Space starts, then loads the
+model for ZeroGPU inference. The first build or cold start can take a while
+because the checkpoint is about 1.2 GB.
+## Notes
+- Inference is decorated with `@spaces.GPU`, so select ZeroGPU hardware in the
+  Space settings.
+- The Woosh inference source is installed from the upstream GitHub repository at
+  a pinned commit.
+- The upstream code is MIT/Apache-2.0. The released model weights are
+  CC-BY-NC, as stated by the upstream project.
+## Local Run
+```bash
+python app.py
+```
+Use `WOOSH_CHECKPOINT_DIR=/path/to/checkpoints/Woosh-DFlow` to point the app at
+an existing checkpoint directory.

app.py ADDED Viewed

	@@ -0,0 +1,481 @@

+"""Woosh-DFlow text-to-audio Space."""
+from __future__ import annotations
+import argparse
+import logging
+import os
+import shutil
+import threading
+import time
+import zipfile
+from pathlib import Path
+from typing import Callable
+try:
+    import spaces
+except ImportError:  # No ZeroGPU helpers installed: substitute a no-op decorator shim.
+    class _SpacesFallback:
+        """Minimal stand-in that exposes a pass-through ``GPU`` decorator."""
+        @staticmethod
+        def GPU(*args, **kwargs):
+            """Return the decorated function unchanged, with or without arguments."""
+            used_bare = len(args) == 1 and not kwargs and callable(args[0])
+            if used_bare:
+                # ``@spaces.GPU`` applied directly to the function.
+                return args[0]
+            # ``@spaces.GPU(...)`` form: hand back an identity decorator.
+            def passthrough(fn: Callable):
+                return fn
+            return passthrough
+    spaces = _SpacesFallback()
+import gradio as gr
+import requests
+import torch
+from woosh.components.base import LoadConfig
+from woosh.inference.flowmap_sampler import sample_euler
+from woosh.model.flowmap_from_pretrained import FlowMapFromPretrained
+# Configure root logging once at import time and create this app's logger.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("woosh_space")
+# Directory containing this file; relative paths are anchored here.
+APP_DIR = Path(__file__).resolve().parent
+# Directory name searched for after the archive is extracted.
+CHECKPOINT_NAME = "Woosh-DFlow"
+DEFAULT_CHECKPOINT_URL = (
+    "https://github.com/SonyResearch/Woosh/releases/download/v1.0.0/"
+    "Woosh-DFlow.zip"
+)
+# The download URL can be overridden through the environment.
+CHECKPOINT_URL = os.getenv("WOOSH_CHECKPOINT_URL", DEFAULT_CHECKPOINT_URL)
+# Sample rate attached to every returned clip and shown in the run summary.
+SAMPLE_RATE = 48_000
+# Shape of the initial noise tensor is (variants, LATENT_CHANNELS, LATENT_FRAMES).
+LATENT_CHANNELS = 128
+LATENT_FRAMES = 501
+# Passed to sample_euler() as ``num_steps`` and ``renoise`` respectively.
+GENERATION_STEPS = 4
+RENOISE_SCHEDULE = [0.0, 0.5, 0.5, 0.3]
+# Upper bound on clips per request; matches the number of audio outputs in the UI.
+MAX_VARIANTS = 2
+# Lazily populated by get_model(); ``_model_lock`` serialises that initialisation.
+_model = None
+_device = None
+_model_lock = threading.Lock()
+# Set by eager_load_model() when the startup preload fails; read by build_ui().
+_startup_error: str | None = None
+def _resolve_app_path(value: str) -> Path:
+    """Expand ``~`` in *value* and anchor a relative result at ``APP_DIR``."""
+    expanded = Path(value).expanduser()
+    return expanded if expanded.is_absolute() else APP_DIR / expanded
+# Checkpoint directory: taken from WOOSH_CHECKPOINT_DIR when set, otherwise
+# ``checkpoints/<CHECKPOINT_NAME>``; relative values are resolved against APP_DIR.
+CHECKPOINT_DIR = _resolve_app_path(
+    os.getenv("WOOSH_CHECKPOINT_DIR", f"checkpoints/{CHECKPOINT_NAME}")
+)
+def _checkpoint_ready(path: Path) -> bool:
+    """Return True when *path* is an existing directory with at least one entry."""
+    if not path.is_dir():
+        return False
+    # The first entry found is enough; the contents are not inspected.
+    for _entry in path.iterdir():
+        return True
+    return False
+def _download_file(url: str, destination: Path) -> None:
+    """Stream *url* into *destination*, publishing the file only once complete.
+    Data is written to a sibling ``.partial`` file that is renamed over
+    *destination* after the transfer finishes. If anything fails, the partial
+    file is removed so a later attempt starts clean.
+    Raises:
+        requests.HTTPError: The server answered with an error status.
+        OSError: The byte count differs from the announced ``Content-Length``.
+    """
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = destination.with_suffix(destination.suffix + ".partial")
+    log.info("Downloading %s to %s", url, destination)
+    try:
+        with requests.get(url, stream=True, timeout=(10, 120)) as response:
+            response.raise_for_status()
+            total = int(response.headers.get("content-length", 0))
+            # Content-Length describes the encoded body while iter_content()
+            # yields decoded bytes, so sizes are compared only for plain bodies.
+            size_is_exact = bool(total) and not response.headers.get(
+                "content-encoding"
+            )
+            downloaded = 0
+            last_log = time.perf_counter()
+            with tmp_path.open("wb") as handle:
+                for chunk in response.iter_content(chunk_size=8 * 1024 * 1024):
+                    if not chunk:
+                        continue
+                    handle.write(chunk)
+                    downloaded += len(chunk)
+                    now = time.perf_counter()
+                    # Throttle progress output to one line every 10 seconds.
+                    if now - last_log > 10:
+                        if total:
+                            pct = downloaded / total * 100
+                            log.info("Checkpoint download %.1f%%", pct)
+                        else:
+                            log.info(
+                                "Checkpoint download %.1f MB",
+                                downloaded / 1024 / 1024,
+                            )
+                        last_log = now
+        if size_is_exact and downloaded != total:
+            raise OSError(
+                f"Incomplete download of {url}: got {downloaded} of {total} bytes."
+            )
+        tmp_path.replace(destination)
+    finally:
+        # No-op after a successful rename; removes leftovers after a failure.
+        tmp_path.unlink(missing_ok=True)
+def ensure_checkpoint(path: Path | None = None) -> Path:
+    """Return a populated checkpoint directory, downloading and extracting if needed.
+    When *path* is ``None`` the current ``CHECKPOINT_DIR`` is used. If the
+    directory is already non-empty it is returned as-is. Otherwise the archive
+    is downloaded (unless already cached), extracted into ``APP_DIR``, and, if
+    the checkpoint did not land at *path*, the extracted ``CHECKPOINT_NAME``
+    directory is moved there.
+    Raises:
+        RuntimeError: No usable checkpoint directory exists after extraction.
+        zipfile.BadZipFile: The archive is still invalid after one re-download.
+    """
+    if path is None:
+        path = CHECKPOINT_DIR
+    if _checkpoint_ready(path):
+        return path
+    archive_path = APP_DIR / "checkpoints" / ".downloads" / f"{CHECKPOINT_NAME}.zip"
+    if not archive_path.exists():
+        _download_file(CHECKPOINT_URL, archive_path)
+    APP_DIR.mkdir(parents=True, exist_ok=True)
+    def _extract_archive() -> None:
+        # Unpack the cached archive into APP_DIR, keeping the archive's own layout.
+        log.info("Extracting %s", archive_path)
+        with zipfile.ZipFile(archive_path) as archive:
+            archive.extractall(APP_DIR)
+    try:
+        _extract_archive()
+    except zipfile.BadZipFile:
+        # A cached archive may be corrupt; fetch it once more before failing.
+        log.warning("Checkpoint archive was invalid; downloading it again.")
+        archive_path.unlink(missing_ok=True)
+        _download_file(CHECKPOINT_URL, archive_path)
+        _extract_archive()
+    if _checkpoint_ready(path):
+        return path
+    # The archive did not unpack at *path*; look for a populated directory with
+    # the checkpoint name, preferring the shallowest match so the choice does
+    # not depend on filesystem iteration order.
+    candidates = sorted(
+        (
+            candidate
+            for candidate in APP_DIR.rglob(CHECKPOINT_NAME)
+            if candidate != path and _checkpoint_ready(candidate)
+        ),
+        key=lambda candidate: (len(candidate.parts), str(candidate)),
+    )
+    if candidates:
+        path.parent.mkdir(parents=True, exist_ok=True)
+        if path.is_dir() and not path.is_symlink():
+            # *path* is not ready, so an existing directory here is empty.
+            # Remove it: shutil.move() would otherwise place the checkpoint
+            # inside it instead of renaming the checkpoint to *path*.
+            path.rmdir()
+        shutil.move(str(candidates[0]), str(path))
+    if not _checkpoint_ready(path):
+        raise RuntimeError(
+            f"Could not find {CHECKPOINT_NAME} after extracting {archive_path}."
+        )
+    return path
+def select_device() -> str:
+    """Pick the torch device string, preferring CUDA, then MPS, then CPU."""
+    if torch.cuda.is_available():
+        return "cuda"
+    # Some torch builds have no ``mps`` backend attribute, hence the getattr.
+    mps_backend = getattr(torch.backends, "mps", None)
+    has_mps = mps_backend is not None and mps_backend.is_available()
+    return "mps" if has_mps else "cpu"
+def get_model():
+    """Return the cached ``(model, device)`` pair, loading the model on first use.
+    The whole load runs under ``_model_lock`` so concurrent callers cannot
+    start a second load.
+    """
+    global _device, _model
+    with _model_lock:
+        if _model is None:
+            checkpoint_path = ensure_checkpoint()
+            _device = select_device()
+            log.info("Loading %s on %s", checkpoint_path, _device)
+            load_config = LoadConfig(path=str(checkpoint_path))
+            loaded = FlowMapFromPretrained(load_config)
+            # Published only after eval()/to() succeed, so a failed load is retried.
+            _model = loaded.eval().to(_device)
+            log.info("Model loaded")
+        return _model, _device
+def _seed_everything(seed: int) -> int:
+    """Seed torch's random generators and return the seed that was applied.
+    A negative *seed* is replaced by a random value in ``[0, 2**31)`` drawn
+    from ``os.urandom``.
+    """
+    chosen = seed
+    if chosen < 0:
+        random_word = int.from_bytes(os.urandom(4), "big")
+        chosen = random_word % (2**31)
+    torch.manual_seed(chosen)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(chosen)
+    return chosen
+def _format_audio_batch(audio: torch.Tensor) -> list[tuple[int, object]]:
+    """Turn a batch tensor into one ``(SAMPLE_RATE, ndarray)`` pair per item.
+    Each item is divided by its absolute peak only when that peak exceeds 1.0,
+    then clamped to ``[-1.0, 1.0]``.
+    """
+    def _to_numpy(clip: torch.Tensor):
+        # The divisor is floored at 1.0, so clips already within range keep their level.
+        divisor = clip.abs().max().clamp(min=1.0)
+        clip = (clip / divisor).clamp(-1.0, 1.0)
+        if clip.ndim == 2:
+            # A single leading row is dropped; otherwise the two axes are swapped.
+            # NOTE(review): assumes the leading axis is channels - confirm.
+            clip = clip.squeeze(0) if clip.shape[0] == 1 else clip.transpose(0, 1)
+        return clip.numpy()
+    host_audio = audio.detach().cpu().float()
+    return [(SAMPLE_RATE, _to_numpy(clip)) for clip in host_audio]
+@spaces.GPU(duration=120)
+@torch.inference_mode()
+def generate(
+    prompt: str,
+    variants: int,
+    cfg_scale: float,
+    seed: int | None,
+    progress=gr.Progress(track_tqdm=False),
+):
+    """Generate up to ``MAX_VARIANTS`` audio clips for *prompt*.
+    Args:
+        prompt: Text description; surrounding whitespace is stripped.
+        variants: Requested clip count, clamped to ``[1, MAX_VARIANTS]``.
+        cfg_scale: Value forwarded to ``sample_euler`` as ``cfg``.
+        seed: RNG seed; a negative value or ``None`` selects a random seed.
+        progress: Gradio progress reporter.
+    Returns:
+        A list of ``MAX_VARIANTS`` audio component updates (unused slots are
+        hidden) followed by a Markdown summary string.
+    Raises:
+        gr.Error: The prompt is empty or the model could not be loaded.
+    """
+    prompt = (prompt or "").strip()
+    if not prompt:
+        raise gr.Error("Enter a short sound description.")
+    variants = max(1, min(int(variants), MAX_VARIANTS))
+    cfg_scale = float(cfg_scale)
+    # A cleared Seed field is delivered as None; treat it like -1 (random seed)
+    # instead of letting int(None) raise a TypeError.
+    seed = _seed_everything(-1 if seed is None else int(seed))
+    try:
+        model, device = get_model()
+    except Exception as exc:
+        raise gr.Error(f"Could not load Woosh-DFlow: {exc}") from exc
+    progress(0.1, desc="Preparing text conditioning")
+    noise = torch.randn(variants, LATENT_CHANNELS, LATENT_FRAMES, device=device)
+    # The same prompt is repeated once per requested variant.
+    cond = model.get_cond(
+        {"audio": None, "description": [prompt] * variants},
+        no_dropout=True,
+        device=device,
+    )
+    progress(0.35, desc="Synthesizing latent audio")
+    # Timing covers sampling and decoding only, not text conditioning.
+    start_time = time.perf_counter()
+    latents = sample_euler(
+        model=model,
+        noise=noise,
+        cond=cond,
+        num_steps=GENERATION_STEPS,
+        renoise=RENOISE_SCHEDULE,
+        cfg=cfg_scale,
+    )
+    progress(0.75, desc="Decoding waveform")
+    audio = model.autoencoder.inverse(latents)
+    elapsed = time.perf_counter() - start_time
+    outputs = _format_audio_batch(audio)
+    if device == "cuda":
+        torch.cuda.empty_cache()
+    audio_updates = [
+        gr.update(value=value, visible=True) for value in outputs[:MAX_VARIANTS]
+    ]
+    # Pad with hidden, empty players so the output count always matches the UI.
+    while len(audio_updates) < MAX_VARIANTS:
+        audio_updates.append(gr.update(value=None, visible=False))
+    details = (
+        f"Generated {variants} take{'s' if variants != 1 else ''} "
+        f"in {elapsed:.1f}s on {device}. "
+        f"Seed: `{seed}`. Steps: `{GENERATION_STEPS}`. "
+        f"Sample rate: `{SAMPLE_RATE} Hz`."
+    )
+    progress(1.0, desc="Done")
+    return [*audio_updates, details]
+def build_ui() -> gr.Blocks:
+    """Build and return the Gradio Blocks interface without launching it.
+    The layout is a hero banner, a prompt column with examples, a settings
+    column (takes, prompt strength, seed), two audio players and a status
+    line. Both pressing Enter in the prompt box and clicking the button call
+    ``generate`` with the same inputs and outputs.
+    """
+    # Custom CSS for the page width, the hero banner and the primary button.
+    css = """
+    .gradio-container {
+        max-width: 1180px !important;
+    }
+    #hero {
+        padding: 28px;
+        border: 1px solid #d8e3df;
+        border-radius: 8px;
+        background: linear-gradient(135deg, #ffffff 0%, #f1faf7 100%);
+    }
+    #hero h1 {
+        margin: 0 0 10px;
+        font-size: 2.35rem;
+        line-height: 1.05;
+        letter-spacing: 0;
+        color: #202124;
+    }
+    #hero p {
+        margin: 0;
+        color: #3a4140;
+        font-size: 1.02rem;
+        line-height: 1.55;
+    }
+    #hero .meta {
+        margin-top: 14px;
+        color: #007a7a;
+        font-weight: 650;
+    }
+    .primary-button {
+        min-height: 48px;
+    }
+    """
+    theme = gr.themes.Soft(
+        primary_hue="red",
+        secondary_hue="teal",
+        neutral_hue="gray",
+        radius_size="sm",
+    )
+    # NOTE(review): Gradio 6 documents ``theme`` and ``css`` as launch()
+    # parameters - confirm these Blocks keyword arguments are still honoured
+    # by the pinned gradio version.
+    with gr.Blocks(
+        title="Woosh-DFlow",
+        theme=theme,
+        css=css,
+        analytics_enabled=False,
+    ) as demo:
+        # Static banner; the ``#hero`` rules in ``css`` style this markup.
+        gr.HTML(
+            """
+            <section id="hero">
+                <h1>Woosh-DFlow</h1>
+                <p>
+                    Fast text-to-audio generation for sound effects, ambience,
+                    impacts, machines, weather, Foley, and synthetic UI sounds.
+                </p>
+                <p class="meta">
+                    Distilled Woosh model by Sony AI. Outputs are five-second,
+                    48 kHz audio clips.
+                </p>
+            </section>
+            """
+        )
+        # Left column: prompt, trigger button and examples. Right column: settings.
+        with gr.Row(equal_height=False):
+            with gr.Column(scale=7):
+                prompt = gr.Textbox(
+                    label="Sound prompt",
+                    placeholder="A heavy metal door slams shut in a concrete hallway",
+                    lines=4,
+                    max_lines=6,
+                )
+                run_button = gr.Button(
+                    "Generate sound",
+                    variant="primary",
+                    elem_classes=["primary-button"],
+                )
+                gr.Examples(
+                    examples=[
+                        "sportscar engine revving and driving away quickly",
+                        "heavy rain on a tin roof with distant thunder",
+                        "large wooden door creaking open in an empty hallway",
+                        "arcade laser blast with a bright digital tail",
+                    ],
+                    inputs=prompt,
+                )
+            with gr.Column(scale=3):
+                variants = gr.Slider(
+                    minimum=1,
+                    maximum=MAX_VARIANTS,
+                    step=1,
+                    value=1,
+                    label="Takes",
+                    info="Generate one or two variations per request.",
+                )
+                cfg_scale = gr.Slider(
+                    minimum=0.0,
+                    maximum=9.0,
+                    step=0.1,
+                    value=4.5,
+                    label="Prompt strength",
+                    info="Higher values follow the prompt more tightly.",
+                )
+                seed = gr.Number(
+                    value=-1,
+                    label="Seed",
+                    precision=0,
+                    info="Use -1 for a random seed.",
+                )
+        # One player per possible take; the second starts hidden and is shown
+        # or hidden by the updates that generate() returns.
+        with gr.Row():
+            audio_1 = gr.Audio(
+                label="Take 1",
+                type="numpy",
+                format="wav",
+                autoplay=True,
+                interactive=False,
+            )
+            audio_2 = gr.Audio(
+                label="Take 2",
+                type="numpy",
+                format="wav",
+                visible=False,
+                interactive=False,
+            )
+        # Status line: default hint, or the preload error captured before the
+        # UI was built.
+        initial_details = (
+            "The first request may wait while the official DFlow checkpoint "
+            "is downloaded and loaded."
+        )
+        if _startup_error is not None:
+            initial_details = (
+                "Model preload failed. Generation will retry the download and "
+                f"load step. Error: `{_startup_error}`"
+            )
+        run_details = gr.Markdown(value=initial_details)
+        # Order must match generate()'s parameters and its returned list.
+        inputs = [prompt, variants, cfg_scale, seed]
+        outputs = [audio_1, audio_2, run_details]
+        # Two triggers share one handler; each is registered under its own API name.
+        prompt.submit(
+            fn=generate,
+            inputs=inputs,
+            outputs=outputs,
+            api_name="generate",
+            show_progress="full",
+        )
+        run_button.click(
+            fn=generate,
+            inputs=inputs,
+            outputs=outputs,
+            api_name="generate_click",
+            show_progress="full",
+        )
+        # Licensing footer.
+        gr.Markdown(
+            """
+            Model weights are downloaded from the official
+            `SonyResearch/Woosh` v1.0.0 release. The released weights are
+            licensed CC-BY-NC; the upstream inference code is MIT/Apache-2.0.
+            """
+        )
+    return demo
+def eager_load_model() -> None:
+    """Preload the model at startup unless WOOSH_EAGER_LOAD disables it.
+    A failed preload is logged and its message stored in ``_startup_error``
+    rather than raised, so the UI can still start.
+    """
+    global _startup_error
+    setting = os.getenv("WOOSH_EAGER_LOAD", "1").lower()
+    if setting in {"0", "false", "no"}:
+        return
+    try:
+        get_model()
+    except Exception as exc:  # Deliberately broad: startup must not abort here.
+        _startup_error = str(exc)
+        log.exception("Model preload failed")
+def main() -> None:
+    """Parse command-line options, optionally preload the model, and serve the UI."""
+    global CHECKPOINT_DIR
+    cli = argparse.ArgumentParser(description="Woosh-DFlow Gradio Space")
+    cli.add_argument(
+        "--checkpoint",
+        type=str,
+        default=str(CHECKPOINT_DIR),
+        help="Path to the Woosh-DFlow checkpoint directory.",
+    )
+    cli.add_argument("--share", action="store_true", help="Create a public link.")
+    cli.add_argument(
+        "--server-name",
+        default=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
+        help="Server address to bind.",
+    )
+    cli.add_argument(
+        "--server-port",
+        type=int,
+        default=int(os.getenv("GRADIO_SERVER_PORT", "7860")),
+        help="Server port.",
+    )
+    options = cli.parse_args()
+    # Rebind before any model load so ensure_checkpoint() sees the CLI override.
+    CHECKPOINT_DIR = _resolve_app_path(options.checkpoint)
+    eager_load_model()
+    app = build_ui()
+    # Requests run one at a time, with at most 12 waiting in the queue.
+    queued_app = app.queue(default_concurrency_limit=1, max_size=12)
+    queued_app.launch(
+        show_error=True,
+        share=options.share,
+        server_name=options.server_name,
+        server_port=options.server_port,
+    )
+# Run the app only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+--extra-index-url https://download.pytorch.org/whl/cu128
+torch==2.8.0
+torchaudio==2.8.0
+torchvision==0.23.0
+gradio==6.12.0
+spaces==0.48.2
+requests>=2.31.0
+soundfile>=0.13.1
+woosh @ git+https://github.com/SonyResearch/Woosh.git@88006c57774a85bede9f87733c019664410d6f4e