Spaces:
Sleeping
Sleeping
orrp committed on
Commit ·
e823eac
1
Parent(s): 1932c0c
Switching to pyproject.toml
Browse files
- pyproject.toml +61 -0
- requirements.txt +0 -16
- setup.py +0 -33
- vampnet/.pre-commit-config.yaml +13 -14
- vampnet/app.py +62 -88
pyproject.toml
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0", "wheel", "Cython"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "wham"
|
| 7 |
+
version = "0.0.1"
|
| 8 |
+
description = "Towards A Translative Model of Sperm Whale Vocalization"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
authors = [
|
| 11 |
+
{ name = "Project CETI" }
|
| 12 |
+
]
|
| 13 |
+
license = { text = "MIT" }
|
| 14 |
+
requires-python = ">=3.9"
|
| 15 |
+
dependencies = [
|
| 16 |
+
"torch",
|
| 17 |
+
"gradio",
|
| 18 |
+
"argbind>=0.3.2",
|
| 19 |
+
"numpy<1.24",
|
| 20 |
+
"pydantic>=2.0,<3",
|
| 21 |
+
"huggingface_hub",
|
| 22 |
+
"loralib",
|
| 23 |
+
"torch_pitch_shift",
|
| 24 |
+
"soundfile",
|
| 25 |
+
"pydub",
|
| 26 |
+
"tqdm",
|
| 27 |
+
"Cython",
|
| 28 |
+
"pandas",
|
| 29 |
+
"pathlib",
|
| 30 |
+
"ffmpeg-python",
|
| 31 |
+
"scikit-learn",
|
| 32 |
+
"wandb",
|
| 33 |
+
"gdown",
|
| 34 |
+
"transformers",
|
| 35 |
+
"fadtk",
|
| 36 |
+
"urllib3==2.0",
|
| 37 |
+
"plotly",
|
| 38 |
+
"pyharp",
|
| 39 |
+
# Git-based dependencies
|
| 40 |
+
"wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat.git",
|
| 41 |
+
"lac @ git+https://github.com/hugofloresgarcia/lac.git",
|
| 42 |
+
"descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git"
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
[tool.setuptools.packages.find]
|
| 46 |
+
where = ["."]
|
| 47 |
+
include = ["wham*", "vampnet*"]
|
| 48 |
+
|
| 49 |
+
[tool.ruff]
|
| 50 |
+
# Target Python 3.9+
|
| 51 |
+
target-version = "py39"
|
| 52 |
+
line-length = 88
|
| 53 |
+
|
| 54 |
+
[tool.ruff.lint]
|
| 55 |
+
# Enable Pyflakes (F), pycodestyle (E, W), and isort (I)
|
| 56 |
+
select = ["E", "F", "W", "I"]
|
| 57 |
+
ignore = []
|
| 58 |
+
|
| 59 |
+
[tool.ruff.format]
|
| 60 |
+
quote-style = "double"
|
| 61 |
+
indent-style = "space"
|
requirements.txt
DELETED
|
@@ -1,16 +0,0 @@
|
|
| 1 |
-
torch
|
| 2 |
-
gradio
|
| 3 |
-
argbind>=0.3.2
|
| 4 |
-
numpy<1.24
|
| 5 |
-
pydantic>=2.0,<3
|
| 6 |
-
huggingface_hub
|
| 7 |
-
loralib
|
| 8 |
-
torch_pitch_shift
|
| 9 |
-
soundfile
|
| 10 |
-
pydub
|
| 11 |
-
tqdm
|
| 12 |
-
Cython
|
| 13 |
-
wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
|
| 14 |
-
lac @ git+https://github.com/hugofloresgarcia/lac.git
|
| 15 |
-
descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git
|
| 16 |
-
pyharp
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
setup.py
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
from setuptools import setup, find_packages
|
| 2 |
-
|
| 3 |
-
with open("README.md") as f:
|
| 4 |
-
long_description = f.read()
|
| 5 |
-
|
| 6 |
-
setup(
|
| 7 |
-
name="wham",
|
| 8 |
-
version="0.0.1",
|
| 9 |
-
long_description=long_description,
|
| 10 |
-
long_description_content_type="text/markdown",
|
| 11 |
-
url="https://github.com/orrp/wam",
|
| 12 |
-
license="MIT",
|
| 13 |
-
packages=find_packages(),
|
| 14 |
-
package_dir={},
|
| 15 |
-
install_requires=[
|
| 16 |
-
"descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git",
|
| 17 |
-
"argbind",
|
| 18 |
-
"pandas",
|
| 19 |
-
"pathlib",
|
| 20 |
-
"pydub",
|
| 21 |
-
"ffmpeg-python",
|
| 22 |
-
"tqdm",
|
| 23 |
-
"scikit-learn",
|
| 24 |
-
"wandb",
|
| 25 |
-
"gdown", # For fetching large files from Google Drive
|
| 26 |
-
"soundfile",
|
| 27 |
-
"transformers",
|
| 28 |
-
"torch",
|
| 29 |
-
"Cython",
|
| 30 |
-
"fadtk",
|
| 31 |
-
"urllib3==2.0"
|
| 32 |
-
],
|
| 33 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vampnet/.pre-commit-config.yaml
CHANGED
|
@@ -1,15 +1,14 @@
|
|
| 1 |
repos:
|
| 2 |
-
- repo: https://github.com/
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
- id: trailing-whitespace
|
|
|
|
| 1 |
repos:
|
| 2 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 3 |
+
rev: v0.3.0 # Use the latest version available
|
| 4 |
+
hooks:
|
| 5 |
+
# Run the linter and import sorter
|
| 6 |
+
- id: ruff
|
| 7 |
+
args: [--fix]
|
| 8 |
+
# Run the formatter (replaces black)
|
| 9 |
+
- id: ruff-format
|
| 10 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 11 |
+
rev: v4.5.0
|
| 12 |
+
hooks:
|
| 13 |
+
- id: end-of-file-fixer
|
| 14 |
+
- id: trailing-whitespace
|
|
|
vampnet/app.py
CHANGED
|
@@ -1,36 +1,12 @@
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
|
| 4 |
-
try:
|
| 5 |
-
import spaces
|
| 6 |
-
ZERO_GPU = True
|
| 7 |
-
except ImportError:
|
| 8 |
-
ZERO_GPU = False
|
| 9 |
-
|
| 10 |
-
def gpu(fn):
|
| 11 |
-
if ZERO_GPU:
|
| 12 |
-
return spaces.GPU(fn)
|
| 13 |
-
return fn
|
| 14 |
|
| 15 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 16 |
-
|
| 17 |
-
if ZERO_GPU:
|
| 18 |
-
from huggingface_hub import hf_hub_download
|
| 19 |
-
MODEL_DIR = os.path.join(SCRIPT_DIR, "models")
|
| 20 |
-
os.makedirs(MODEL_DIR, exist_ok=True)
|
| 21 |
-
MODEL_REPO = "anvitax/wham-weights"
|
| 22 |
-
for fname in ["coarse.pth", "c2f.pth", "codec.pth", "wavebeat.pth"]:
|
| 23 |
-
target = os.path.join(MODEL_DIR, fname)
|
| 24 |
-
if not os.path.exists(target):
|
| 25 |
-
print(f"Downloading {fname} from {MODEL_REPO}...")
|
| 26 |
-
hf_hub_download(repo_id=MODEL_REPO, filename=fname, local_dir=MODEL_DIR)
|
| 27 |
-
else:
|
| 28 |
-
print(f"Found {fname}")
|
| 29 |
-
|
| 30 |
os.chdir(SCRIPT_DIR)
|
| 31 |
|
| 32 |
import torch
|
| 33 |
-
device = "
|
| 34 |
sys.argv = ["app.py", "--args.load", "conf/interface.yml", "--Interface.device", device]
|
| 35 |
|
| 36 |
from pathlib import Path
|
|
@@ -56,8 +32,8 @@ conf = argbind.parse_args()
|
|
| 56 |
from torch_pitch_shift import pitch_shift, get_fast_shifts
|
| 57 |
def shift_pitch(signal, interval: int):
|
| 58 |
signal.samples = pitch_shift(
|
| 59 |
-
signal.samples,
|
| 60 |
-
shift=interval,
|
| 61 |
sample_rate=signal.sample_rate
|
| 62 |
)
|
| 63 |
return signal
|
|
@@ -83,7 +59,7 @@ def load_audio(file):
|
|
| 83 |
print(file)
|
| 84 |
filepath = file.name
|
| 85 |
sig = at.AudioSignal.salient_excerpt(
|
| 86 |
-
filepath,
|
| 87 |
duration=interface.coarse.chunk_size_s
|
| 88 |
)
|
| 89 |
sig = interface.preprocess(sig)
|
|
@@ -98,7 +74,6 @@ def load_example_audio():
|
|
| 98 |
return "./assets/example.wav"
|
| 99 |
|
| 100 |
|
| 101 |
-
@gpu
|
| 102 |
def _vamp(
|
| 103 |
_input_audio,
|
| 104 |
_num_steps,
|
|
@@ -167,7 +142,7 @@ def _vamp(
|
|
| 167 |
if _beat_mask_width > 0:
|
| 168 |
beat_mask = interface.make_beat_mask(
|
| 169 |
sig,
|
| 170 |
-
after_beat_s=(_beat_mask_width/1000),
|
| 171 |
mask_upbeats=not _beat_mask_downbeats,
|
| 172 |
)
|
| 173 |
mask = pmask.mask_and(mask, beat_mask)
|
|
@@ -203,29 +178,29 @@ def _vamp(
|
|
| 203 |
|
| 204 |
_seed_val = _seed if _seed > 0 else None
|
| 205 |
zv, mask_z = interface.coarse_vamp(
|
| 206 |
-
z,
|
| 207 |
mask=mask,
|
| 208 |
sampling_steps=_num_steps,
|
| 209 |
mask_temperature=_masktemp*10,
|
| 210 |
sampling_temperature=_sampletemp,
|
| 211 |
-
return_mask=True,
|
| 212 |
-
typical_filtering=_typical_filtering,
|
| 213 |
-
typical_mass=_typical_mass,
|
| 214 |
-
typical_min_tokens=_typical_min_tokens,
|
| 215 |
top_p=_top_p_val,
|
| 216 |
gen_fn=interface.coarse.generate,
|
| 217 |
seed=_seed_val,
|
| 218 |
sample_cutoff=_sample_cutoff,
|
| 219 |
)
|
| 220 |
|
| 221 |
-
if _use_coarse2fine:
|
| 222 |
zv = interface.coarse_to_fine(
|
| 223 |
-
zv,
|
| 224 |
-
mask_temperature=_masktemp*10,
|
| 225 |
sampling_temperature=_sampletemp,
|
| 226 |
mask=mask,
|
| 227 |
sampling_steps=_num_steps,
|
| 228 |
-
sample_cutoff=_sample_cutoff,
|
| 229 |
seed=_seed_val,
|
| 230 |
)
|
| 231 |
|
|
@@ -233,7 +208,7 @@ def _vamp(
|
|
| 233 |
print("done")
|
| 234 |
|
| 235 |
print(f"output loudness is {sig.loudness()}")
|
| 236 |
-
sig = sig.normalize(loudness)
|
| 237 |
print(f"normalized loudness is {sig.loudness()}")
|
| 238 |
|
| 239 |
sig.write(out_dir / "output.wav")
|
|
@@ -280,7 +255,7 @@ def vamp(data):
|
|
| 280 |
|
| 281 |
def api_vamp(data):
|
| 282 |
return _extract_and_call_vamp(data, return_mask=False)
|
| 283 |
-
|
| 284 |
def save_vamp(data):
|
| 285 |
out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
|
| 286 |
out_dir.mkdir(parents=True, exist_ok=True)
|
|
@@ -290,7 +265,7 @@ def save_vamp(data):
|
|
| 290 |
|
| 291 |
sig_in.write(out_dir / "input.wav")
|
| 292 |
sig_out.write(out_dir / "output.wav")
|
| 293 |
-
|
| 294 |
_data = {
|
| 295 |
"masktemp": data[masktemp],
|
| 296 |
"sampletemp": data[sampletemp],
|
|
@@ -302,7 +277,7 @@ def save_vamp(data):
|
|
| 302 |
"notes": data[notes_text],
|
| 303 |
"periodic_period": data[periodic_p],
|
| 304 |
"periodic_width": data[periodic_w],
|
| 305 |
-
"n_conditioning_codebooks": data[n_conditioning_codebooks],
|
| 306 |
"use_coarse2fine": data[use_coarse2fine],
|
| 307 |
"stretch_factor": data[stretch_factor],
|
| 308 |
"seed": data[seed],
|
|
@@ -322,7 +297,6 @@ def save_vamp(data):
|
|
| 322 |
return f"saved! your save code is {out_dir.stem}", zip_path
|
| 323 |
|
| 324 |
|
| 325 |
-
@gpu
|
| 326 |
def harp_vamp(_input_audio, _beat_mask_width, _sampletemp):
|
| 327 |
interface.to("cuda")
|
| 328 |
|
|
@@ -338,22 +312,22 @@ def harp_vamp(_input_audio, _beat_mask_width, _sampletemp):
|
|
| 338 |
if _beat_mask_width > 0:
|
| 339 |
beat_mask = interface.make_beat_mask(
|
| 340 |
sig,
|
| 341 |
-
after_beat_s=(_beat_mask_width/1000),
|
| 342 |
)
|
| 343 |
mask = pmask.mask_and(mask, beat_mask)
|
| 344 |
|
| 345 |
# save the mask as a txt file
|
| 346 |
zv, mask_z = interface.coarse_vamp(
|
| 347 |
-
z,
|
| 348 |
mask=mask,
|
| 349 |
sampling_temperature=_sampletemp,
|
| 350 |
-
return_mask=True,
|
| 351 |
gen_fn=interface.coarse.generate,
|
| 352 |
)
|
| 353 |
|
| 354 |
|
| 355 |
zv = interface.coarse_to_fine(
|
| 356 |
-
zv,
|
| 357 |
sampling_temperature=_sampletemp,
|
| 358 |
mask=mask,
|
| 359 |
)
|
|
@@ -371,17 +345,17 @@ with gr.Blocks() as demo:
|
|
| 371 |
with gr.Column():
|
| 372 |
gr.Markdown("# VampNet Audio Vamping")
|
| 373 |
gr.Markdown("""## Description:
|
| 374 |
-
This is a demo of the VampNet, a generative audio model that transforms the input audio based on the chosen settings.
|
| 375 |
-
You can control the extent and nature of variation with a set of manual controls and presets.
|
| 376 |
Use this interface to experiment with different mask settings and explore the audio outputs.
|
| 377 |
""")
|
| 378 |
|
| 379 |
gr.Markdown("""
|
| 380 |
## Instructions:
|
| 381 |
-
1. You can start by uploading some audio, or by loading the example audio.
|
| 382 |
-
2. Choose a preset for the vamp operation, or manually adjust the controls to customize the mask settings.
|
| 383 |
3. Click the "generate (vamp)!!!" button to apply the vamp operation. Listen to the output audio.
|
| 384 |
-
4. Optionally, you can add some notes and save the result.
|
| 385 |
5. You can also use the output as the new input and continue experimenting!
|
| 386 |
""")
|
| 387 |
with gr.Row():
|
|
@@ -396,13 +370,13 @@ with gr.Blocks() as demo:
|
|
| 396 |
|
| 397 |
input_audio = gr.Audio(
|
| 398 |
label="input audio",
|
| 399 |
-
interactive=False,
|
| 400 |
type="filepath",
|
| 401 |
)
|
| 402 |
|
| 403 |
audio_mask = gr.Audio(
|
| 404 |
label="audio mask (listen to this to hear the mask hints)",
|
| 405 |
-
interactive=False,
|
| 406 |
type="filepath",
|
| 407 |
)
|
| 408 |
|
|
@@ -418,7 +392,7 @@ with gr.Blocks() as demo:
|
|
| 418 |
inputs=[manual_audio_upload],
|
| 419 |
outputs=[ input_audio]
|
| 420 |
)
|
| 421 |
-
|
| 422 |
# mask settings
|
| 423 |
with gr.Column():
|
| 424 |
|
|
@@ -429,7 +403,7 @@ with gr.Blocks() as demo:
|
|
| 429 |
"onset_mask_width": 0,
|
| 430 |
"beat_mask_width": 0,
|
| 431 |
"beat_mask_downbeats": False,
|
| 432 |
-
},
|
| 433 |
"slight periodic variation": {
|
| 434 |
"periodic_p": 5,
|
| 435 |
"onset_mask_width": 5,
|
|
@@ -475,7 +449,7 @@ with gr.Blocks() as demo:
|
|
| 475 |
}
|
| 476 |
|
| 477 |
preset = gr.Dropdown(
|
| 478 |
-
label="preset",
|
| 479 |
choices=list(presets.keys()),
|
| 480 |
value="strong periodic variation",
|
| 481 |
)
|
|
@@ -485,9 +459,9 @@ with gr.Blocks() as demo:
|
|
| 485 |
periodic_p = gr.Slider(
|
| 486 |
label="periodic prompt (0 - unconditional, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
|
| 487 |
minimum=0,
|
| 488 |
-
maximum=128,
|
| 489 |
step=1,
|
| 490 |
-
value=3,
|
| 491 |
)
|
| 492 |
|
| 493 |
|
|
@@ -506,7 +480,7 @@ with gr.Blocks() as demo:
|
|
| 506 |
value=0,
|
| 507 |
)
|
| 508 |
beat_mask_downbeats = gr.Checkbox(
|
| 509 |
-
label="beat mask downbeats only?",
|
| 510 |
value=False
|
| 511 |
)
|
| 512 |
|
|
@@ -540,7 +514,7 @@ with gr.Blocks() as demo:
|
|
| 540 |
value=1,
|
| 541 |
)
|
| 542 |
n_conditioning_codebooks = gr.Number(
|
| 543 |
-
label="number of conditioning codebooks. probably 0",
|
| 544 |
value=0,
|
| 545 |
precision=0,
|
| 546 |
)
|
|
@@ -548,14 +522,14 @@ with gr.Blocks() as demo:
|
|
| 548 |
stretch_factor = gr.Slider(
|
| 549 |
label="time stretch factor",
|
| 550 |
minimum=0,
|
| 551 |
-
maximum=64,
|
| 552 |
step=1,
|
| 553 |
-
value=1,
|
| 554 |
)
|
| 555 |
|
| 556 |
preset_outputs = {
|
| 557 |
-
periodic_p,
|
| 558 |
-
onset_mask_width,
|
| 559 |
beat_mask_width,
|
| 560 |
beat_mask_downbeats,
|
| 561 |
}
|
|
@@ -594,10 +568,10 @@ with gr.Blocks() as demo:
|
|
| 594 |
label="sample temperature",
|
| 595 |
minimum=0.1,
|
| 596 |
maximum=10.0,
|
| 597 |
-
value=1.0,
|
| 598 |
step=0.001
|
| 599 |
)
|
| 600 |
-
|
| 601 |
|
| 602 |
|
| 603 |
with gr.Accordion("sampling settings", open=False):
|
|
@@ -611,7 +585,7 @@ with gr.Blocks() as demo:
|
|
| 611 |
label="typical filtering ",
|
| 612 |
value=False
|
| 613 |
)
|
| 614 |
-
typical_mass = gr.Slider(
|
| 615 |
label="typical mass (should probably stay between 0.1 and 0.5)",
|
| 616 |
minimum=0.01,
|
| 617 |
maximum=0.99,
|
|
@@ -628,13 +602,13 @@ with gr.Blocks() as demo:
|
|
| 628 |
label="sample cutoff",
|
| 629 |
minimum=0.0,
|
| 630 |
maximum=1.0,
|
| 631 |
-
value=0.5,
|
| 632 |
step=0.01
|
| 633 |
)
|
| 634 |
|
| 635 |
use_coarse2fine = gr.Checkbox(
|
| 636 |
label="use coarse2fine",
|
| 637 |
-
value=True,
|
| 638 |
visible=False
|
| 639 |
)
|
| 640 |
|
|
@@ -667,9 +641,9 @@ with gr.Blocks() as demo:
|
|
| 667 |
with gr.Column():
|
| 668 |
|
| 669 |
# lora_choice = gr.Dropdown(
|
| 670 |
-
# label="lora choice",
|
| 671 |
# choices=list(loras.keys()),
|
| 672 |
-
# value=LORA_NONE,
|
| 673 |
# visible=False
|
| 674 |
# )
|
| 675 |
|
|
@@ -681,7 +655,7 @@ with gr.Blocks() as demo:
|
|
| 681 |
)
|
| 682 |
|
| 683 |
notes_text = gr.Textbox(
|
| 684 |
-
label="type any notes about the generated audio here",
|
| 685 |
value="",
|
| 686 |
interactive=True
|
| 687 |
)
|
|
@@ -691,48 +665,48 @@ with gr.Blocks() as demo:
|
|
| 691 |
interactive=False
|
| 692 |
)
|
| 693 |
use_as_input_button = gr.Button("use output as input")
|
| 694 |
-
|
| 695 |
thank_you = gr.Markdown("")
|
| 696 |
|
| 697 |
|
| 698 |
_inputs = {
|
| 699 |
-
input_audio,
|
| 700 |
num_steps,
|
| 701 |
masktemp,
|
| 702 |
sampletemp,
|
| 703 |
top_p,
|
| 704 |
-
prefix_s, suffix_s,
|
| 705 |
-
rand_mask_intensity,
|
| 706 |
periodic_p, periodic_w,
|
| 707 |
-
n_conditioning_codebooks,
|
| 708 |
dropout,
|
| 709 |
-
use_coarse2fine,
|
| 710 |
-
stretch_factor,
|
| 711 |
-
onset_mask_width,
|
| 712 |
typical_filtering,
|
| 713 |
typical_mass,
|
| 714 |
typical_min_tokens,
|
| 715 |
beat_mask_width,
|
| 716 |
beat_mask_downbeats,
|
| 717 |
-
seed,
|
| 718 |
# lora_choice,
|
| 719 |
n_mask_codebooks,
|
| 720 |
-
pitch_shift_amt,
|
| 721 |
sample_cutoff
|
| 722 |
}
|
| 723 |
-
|
| 724 |
# connect widgets
|
| 725 |
vamp_button.click(
|
| 726 |
fn=vamp,
|
| 727 |
inputs=_inputs,
|
| 728 |
-
outputs=[output_audio, audio_mask],
|
| 729 |
)
|
| 730 |
|
| 731 |
api_vamp_button = gr.Button("api vamp", visible=False)
|
| 732 |
api_vamp_button.click(
|
| 733 |
fn=api_vamp,
|
| 734 |
-
inputs=_inputs,
|
| 735 |
-
outputs=[output_audio],
|
| 736 |
api_name="vamp"
|
| 737 |
)
|
| 738 |
|
|
|
|
| 1 |
import os
|
| 2 |
import sys
|
| 3 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
os.chdir(SCRIPT_DIR)
|
| 7 |
|
| 8 |
import torch
|
| 9 |
+
device = "cuda" if torch.cuda.is_available()
|
| 10 |
sys.argv = ["app.py", "--args.load", "conf/interface.yml", "--Interface.device", device]
|
| 11 |
|
| 12 |
from pathlib import Path
|
|
|
|
| 32 |
from torch_pitch_shift import pitch_shift, get_fast_shifts
|
| 33 |
def shift_pitch(signal, interval: int):
|
| 34 |
signal.samples = pitch_shift(
|
| 35 |
+
signal.samples,
|
| 36 |
+
shift=interval,
|
| 37 |
sample_rate=signal.sample_rate
|
| 38 |
)
|
| 39 |
return signal
|
|
|
|
| 59 |
print(file)
|
| 60 |
filepath = file.name
|
| 61 |
sig = at.AudioSignal.salient_excerpt(
|
| 62 |
+
filepath,
|
| 63 |
duration=interface.coarse.chunk_size_s
|
| 64 |
)
|
| 65 |
sig = interface.preprocess(sig)
|
|
|
|
| 74 |
return "./assets/example.wav"
|
| 75 |
|
| 76 |
|
|
|
|
| 77 |
def _vamp(
|
| 78 |
_input_audio,
|
| 79 |
_num_steps,
|
|
|
|
| 142 |
if _beat_mask_width > 0:
|
| 143 |
beat_mask = interface.make_beat_mask(
|
| 144 |
sig,
|
| 145 |
+
after_beat_s=(_beat_mask_width/1000),
|
| 146 |
mask_upbeats=not _beat_mask_downbeats,
|
| 147 |
)
|
| 148 |
mask = pmask.mask_and(mask, beat_mask)
|
|
|
|
| 178 |
|
| 179 |
_seed_val = _seed if _seed > 0 else None
|
| 180 |
zv, mask_z = interface.coarse_vamp(
|
| 181 |
+
z,
|
| 182 |
mask=mask,
|
| 183 |
sampling_steps=_num_steps,
|
| 184 |
mask_temperature=_masktemp*10,
|
| 185 |
sampling_temperature=_sampletemp,
|
| 186 |
+
return_mask=True,
|
| 187 |
+
typical_filtering=_typical_filtering,
|
| 188 |
+
typical_mass=_typical_mass,
|
| 189 |
+
typical_min_tokens=_typical_min_tokens,
|
| 190 |
top_p=_top_p_val,
|
| 191 |
gen_fn=interface.coarse.generate,
|
| 192 |
seed=_seed_val,
|
| 193 |
sample_cutoff=_sample_cutoff,
|
| 194 |
)
|
| 195 |
|
| 196 |
+
if _use_coarse2fine:
|
| 197 |
zv = interface.coarse_to_fine(
|
| 198 |
+
zv,
|
| 199 |
+
mask_temperature=_masktemp*10,
|
| 200 |
sampling_temperature=_sampletemp,
|
| 201 |
mask=mask,
|
| 202 |
sampling_steps=_num_steps,
|
| 203 |
+
sample_cutoff=_sample_cutoff,
|
| 204 |
seed=_seed_val,
|
| 205 |
)
|
| 206 |
|
|
|
|
| 208 |
print("done")
|
| 209 |
|
| 210 |
print(f"output loudness is {sig.loudness()}")
|
| 211 |
+
sig = sig.normalize(loudness)
|
| 212 |
print(f"normalized loudness is {sig.loudness()}")
|
| 213 |
|
| 214 |
sig.write(out_dir / "output.wav")
|
|
|
|
| 255 |
|
| 256 |
def api_vamp(data):
|
| 257 |
return _extract_and_call_vamp(data, return_mask=False)
|
| 258 |
+
|
| 259 |
def save_vamp(data):
|
| 260 |
out_dir = OUT_DIR / "saved" / str(uuid.uuid4())
|
| 261 |
out_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
| 265 |
|
| 266 |
sig_in.write(out_dir / "input.wav")
|
| 267 |
sig_out.write(out_dir / "output.wav")
|
| 268 |
+
|
| 269 |
_data = {
|
| 270 |
"masktemp": data[masktemp],
|
| 271 |
"sampletemp": data[sampletemp],
|
|
|
|
| 277 |
"notes": data[notes_text],
|
| 278 |
"periodic_period": data[periodic_p],
|
| 279 |
"periodic_width": data[periodic_w],
|
| 280 |
+
"n_conditioning_codebooks": data[n_conditioning_codebooks],
|
| 281 |
"use_coarse2fine": data[use_coarse2fine],
|
| 282 |
"stretch_factor": data[stretch_factor],
|
| 283 |
"seed": data[seed],
|
|
|
|
| 297 |
return f"saved! your save code is {out_dir.stem}", zip_path
|
| 298 |
|
| 299 |
|
|
|
|
| 300 |
def harp_vamp(_input_audio, _beat_mask_width, _sampletemp):
|
| 301 |
interface.to("cuda")
|
| 302 |
|
|
|
|
| 312 |
if _beat_mask_width > 0:
|
| 313 |
beat_mask = interface.make_beat_mask(
|
| 314 |
sig,
|
| 315 |
+
after_beat_s=(_beat_mask_width/1000),
|
| 316 |
)
|
| 317 |
mask = pmask.mask_and(mask, beat_mask)
|
| 318 |
|
| 319 |
# save the mask as a txt file
|
| 320 |
zv, mask_z = interface.coarse_vamp(
|
| 321 |
+
z,
|
| 322 |
mask=mask,
|
| 323 |
sampling_temperature=_sampletemp,
|
| 324 |
+
return_mask=True,
|
| 325 |
gen_fn=interface.coarse.generate,
|
| 326 |
)
|
| 327 |
|
| 328 |
|
| 329 |
zv = interface.coarse_to_fine(
|
| 330 |
+
zv,
|
| 331 |
sampling_temperature=_sampletemp,
|
| 332 |
mask=mask,
|
| 333 |
)
|
|
|
|
| 345 |
with gr.Column():
|
| 346 |
gr.Markdown("# VampNet Audio Vamping")
|
| 347 |
gr.Markdown("""## Description:
|
| 348 |
+
This is a demo of the VampNet, a generative audio model that transforms the input audio based on the chosen settings.
|
| 349 |
+
You can control the extent and nature of variation with a set of manual controls and presets.
|
| 350 |
Use this interface to experiment with different mask settings and explore the audio outputs.
|
| 351 |
""")
|
| 352 |
|
| 353 |
gr.Markdown("""
|
| 354 |
## Instructions:
|
| 355 |
+
1. You can start by uploading some audio, or by loading the example audio.
|
| 356 |
+
2. Choose a preset for the vamp operation, or manually adjust the controls to customize the mask settings.
|
| 357 |
3. Click the "generate (vamp)!!!" button to apply the vamp operation. Listen to the output audio.
|
| 358 |
+
4. Optionally, you can add some notes and save the result.
|
| 359 |
5. You can also use the output as the new input and continue experimenting!
|
| 360 |
""")
|
| 361 |
with gr.Row():
|
|
|
|
| 370 |
|
| 371 |
input_audio = gr.Audio(
|
| 372 |
label="input audio",
|
| 373 |
+
interactive=False,
|
| 374 |
type="filepath",
|
| 375 |
)
|
| 376 |
|
| 377 |
audio_mask = gr.Audio(
|
| 378 |
label="audio mask (listen to this to hear the mask hints)",
|
| 379 |
+
interactive=False,
|
| 380 |
type="filepath",
|
| 381 |
)
|
| 382 |
|
|
|
|
| 392 |
inputs=[manual_audio_upload],
|
| 393 |
outputs=[ input_audio]
|
| 394 |
)
|
| 395 |
+
|
| 396 |
# mask settings
|
| 397 |
with gr.Column():
|
| 398 |
|
|
|
|
| 403 |
"onset_mask_width": 0,
|
| 404 |
"beat_mask_width": 0,
|
| 405 |
"beat_mask_downbeats": False,
|
| 406 |
+
},
|
| 407 |
"slight periodic variation": {
|
| 408 |
"periodic_p": 5,
|
| 409 |
"onset_mask_width": 5,
|
|
|
|
| 449 |
}
|
| 450 |
|
| 451 |
preset = gr.Dropdown(
|
| 452 |
+
label="preset",
|
| 453 |
choices=list(presets.keys()),
|
| 454 |
value="strong periodic variation",
|
| 455 |
)
|
|
|
|
| 459 |
periodic_p = gr.Slider(
|
| 460 |
label="periodic prompt (0 - unconditional, 2 - lots of hints, 8 - a couple of hints, 16 - occasional hint, 32 - very occasional hint, etc)",
|
| 461 |
minimum=0,
|
| 462 |
+
maximum=128,
|
| 463 |
step=1,
|
| 464 |
+
value=3,
|
| 465 |
)
|
| 466 |
|
| 467 |
|
|
|
|
| 480 |
value=0,
|
| 481 |
)
|
| 482 |
beat_mask_downbeats = gr.Checkbox(
|
| 483 |
+
label="beat mask downbeats only?",
|
| 484 |
value=False
|
| 485 |
)
|
| 486 |
|
|
|
|
| 514 |
value=1,
|
| 515 |
)
|
| 516 |
n_conditioning_codebooks = gr.Number(
|
| 517 |
+
label="number of conditioning codebooks. probably 0",
|
| 518 |
value=0,
|
| 519 |
precision=0,
|
| 520 |
)
|
|
|
|
| 522 |
stretch_factor = gr.Slider(
|
| 523 |
label="time stretch factor",
|
| 524 |
minimum=0,
|
| 525 |
+
maximum=64,
|
| 526 |
step=1,
|
| 527 |
+
value=1,
|
| 528 |
)
|
| 529 |
|
| 530 |
preset_outputs = {
|
| 531 |
+
periodic_p,
|
| 532 |
+
onset_mask_width,
|
| 533 |
beat_mask_width,
|
| 534 |
beat_mask_downbeats,
|
| 535 |
}
|
|
|
|
| 568 |
label="sample temperature",
|
| 569 |
minimum=0.1,
|
| 570 |
maximum=10.0,
|
| 571 |
+
value=1.0,
|
| 572 |
step=0.001
|
| 573 |
)
|
| 574 |
+
|
| 575 |
|
| 576 |
|
| 577 |
with gr.Accordion("sampling settings", open=False):
|
|
|
|
| 585 |
label="typical filtering ",
|
| 586 |
value=False
|
| 587 |
)
|
| 588 |
+
typical_mass = gr.Slider(
|
| 589 |
label="typical mass (should probably stay between 0.1 and 0.5)",
|
| 590 |
minimum=0.01,
|
| 591 |
maximum=0.99,
|
|
|
|
| 602 |
label="sample cutoff",
|
| 603 |
minimum=0.0,
|
| 604 |
maximum=1.0,
|
| 605 |
+
value=0.5,
|
| 606 |
step=0.01
|
| 607 |
)
|
| 608 |
|
| 609 |
use_coarse2fine = gr.Checkbox(
|
| 610 |
label="use coarse2fine",
|
| 611 |
+
value=True,
|
| 612 |
visible=False
|
| 613 |
)
|
| 614 |
|
|
|
|
| 641 |
with gr.Column():
|
| 642 |
|
| 643 |
# lora_choice = gr.Dropdown(
|
| 644 |
+
# label="lora choice",
|
| 645 |
# choices=list(loras.keys()),
|
| 646 |
+
# value=LORA_NONE,
|
| 647 |
# visible=False
|
| 648 |
# )
|
| 649 |
|
|
|
|
| 655 |
)
|
| 656 |
|
| 657 |
notes_text = gr.Textbox(
|
| 658 |
+
label="type any notes about the generated audio here",
|
| 659 |
value="",
|
| 660 |
interactive=True
|
| 661 |
)
|
|
|
|
| 665 |
interactive=False
|
| 666 |
)
|
| 667 |
use_as_input_button = gr.Button("use output as input")
|
| 668 |
+
|
| 669 |
thank_you = gr.Markdown("")
|
| 670 |
|
| 671 |
|
| 672 |
_inputs = {
|
| 673 |
+
input_audio,
|
| 674 |
num_steps,
|
| 675 |
masktemp,
|
| 676 |
sampletemp,
|
| 677 |
top_p,
|
| 678 |
+
prefix_s, suffix_s,
|
| 679 |
+
rand_mask_intensity,
|
| 680 |
periodic_p, periodic_w,
|
| 681 |
+
n_conditioning_codebooks,
|
| 682 |
dropout,
|
| 683 |
+
use_coarse2fine,
|
| 684 |
+
stretch_factor,
|
| 685 |
+
onset_mask_width,
|
| 686 |
typical_filtering,
|
| 687 |
typical_mass,
|
| 688 |
typical_min_tokens,
|
| 689 |
beat_mask_width,
|
| 690 |
beat_mask_downbeats,
|
| 691 |
+
seed,
|
| 692 |
# lora_choice,
|
| 693 |
n_mask_codebooks,
|
| 694 |
+
pitch_shift_amt,
|
| 695 |
sample_cutoff
|
| 696 |
}
|
| 697 |
+
|
| 698 |
# connect widgets
|
| 699 |
vamp_button.click(
|
| 700 |
fn=vamp,
|
| 701 |
inputs=_inputs,
|
| 702 |
+
outputs=[output_audio, audio_mask],
|
| 703 |
)
|
| 704 |
|
| 705 |
api_vamp_button = gr.Button("api vamp", visible=False)
|
| 706 |
api_vamp_button.click(
|
| 707 |
fn=api_vamp,
|
| 708 |
+
inputs=_inputs,
|
| 709 |
+
outputs=[output_audio],
|
| 710 |
api_name="vamp"
|
| 711 |
)
|
| 712 |
|