Spaces:
Running
Running
Initial upload: LocalVQE demo Space
Browse files
- .gitattributes +10 -0
- app.py +40 -18
- examples/dt_mic.wav +3 -0
- examples/dt_ref.wav +3 -0
- examples/fe_st2_mic.wav +3 -0
- examples/fe_st2_ref.wav +3 -0
- examples/fe_st_mic.wav +3 -0
- examples/fe_st_ref.wav +3 -0
- examples/ne_st_clean_mic.wav +3 -0
- examples/ne_st_clean_ref.wav +3 -0
- examples/ne_st_noisy_mic.wav +3 -0
- examples/ne_st_noisy_ref.wav +3 -0
.gitattributes
CHANGED
|
@@ -43,3 +43,13 @@ examples/fe_st2_mic.flac filter=lfs diff=lfs merge=lfs -text
|
|
| 43 |
examples/fe_st2_ref.flac filter=lfs diff=lfs merge=lfs -text
|
| 44 |
examples/ne_st_clean_mic.flac filter=lfs diff=lfs merge=lfs -text
|
| 45 |
examples/ne_st_clean_ref.flac filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
examples/fe_st2_ref.flac filter=lfs diff=lfs merge=lfs -text
|
| 44 |
examples/ne_st_clean_mic.flac filter=lfs diff=lfs merge=lfs -text
|
| 45 |
examples/ne_st_clean_ref.flac filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
examples/dt_mic.wav filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
examples/dt_ref.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
examples/fe_st2_mic.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
examples/fe_st2_ref.wav filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
examples/fe_st_mic.wav filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
examples/fe_st_ref.wav filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
examples/ne_st_clean_mic.wav filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
examples/ne_st_clean_ref.wav filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
examples/ne_st_noisy_mic.wav filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
examples/ne_st_noisy_ref.wav filter=lfs diff=lfs merge=lfs -text
|
app.py
CHANGED
|
@@ -5,6 +5,7 @@ huggingface.co/LocalAI-io/LocalVQE. Set the env var
|
|
| 5 |
`LOCALVQE_LOCAL_CKPT=/path/to/checkpoint.pt` to load a local file
|
| 6 |
instead — useful for auditioning new training runs.
|
| 7 |
"""
|
|
|
|
| 8 |
import os
|
| 9 |
from pathlib import Path
|
| 10 |
|
|
@@ -22,14 +23,25 @@ CKPT_FILE = "localvqe-v1-1.3M.pt"
|
|
| 22 |
EXAMPLES_DIR = Path(__file__).resolve().parent / "examples"
|
| 23 |
|
| 24 |
|
| 25 |
-
def _build_model() -> LocalVQE:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
local_override = os.environ.get("LOCALVQE_LOCAL_CKPT")
|
| 27 |
if local_override:
|
| 28 |
ckpt_path = local_override
|
|
|
|
| 29 |
print(f"Loading local checkpoint: {ckpt_path}")
|
| 30 |
else:
|
| 31 |
from huggingface_hub import hf_hub_download
|
| 32 |
ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=CKPT_FILE)
|
|
|
|
|
|
|
| 33 |
cfg = Config()
|
| 34 |
peek = torch.load(ckpt_path, map_location="cpu", weights_only=False)
|
| 35 |
apply_ckpt_model_config(peek, cfg)
|
|
@@ -43,11 +55,12 @@ def _build_model() -> LocalVQE:
|
|
| 43 |
model.align.fold_temperature()
|
| 44 |
model.eval()
|
| 45 |
n_params = sum(p.numel() for p in model.parameters())
|
| 46 |
-
|
| 47 |
-
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
-
MODEL = _build_model()
|
| 51 |
|
| 52 |
|
| 53 |
def _load_mono_16k(path: str) -> np.ndarray:
|
|
@@ -93,24 +106,24 @@ def enhance(mic_path: str, ref_path: str) -> tuple[int, np.ndarray]:
|
|
| 93 |
|
| 94 |
EXAMPLES = [
|
| 95 |
[
|
| 96 |
-
str(EXAMPLES_DIR / "ne_st_noisy_mic.
|
| 97 |
-
str(EXAMPLES_DIR / "ne_st_noisy_ref.
|
| 98 |
],
|
| 99 |
[
|
| 100 |
-
str(EXAMPLES_DIR / "ne_st_clean_mic.flac"),
|
| 101 |
-
str(EXAMPLES_DIR / "ne_st_clean_ref.flac"),
|
| 102 |
],
|
| 103 |
[
|
| 104 |
-
str(EXAMPLES_DIR / "fe_st_mic.
|
| 105 |
-
str(EXAMPLES_DIR / "fe_st_ref.
|
| 106 |
],
|
| 107 |
[
|
| 108 |
-
str(EXAMPLES_DIR / "fe_st2_mic.flac"),
|
| 109 |
-
str(EXAMPLES_DIR / "fe_st2_ref.flac"),
|
| 110 |
],
|
| 111 |
[
|
| 112 |
-
str(EXAMPLES_DIR / "dt_mic.
|
| 113 |
-
str(EXAMPLES_DIR / "dt_ref.
|
| 114 |
],
|
| 115 |
]
|
| 116 |
|
|
@@ -130,8 +143,10 @@ Provide two inputs:
|
|
| 130 |
|
| 131 |
Try the bundled examples first — they cover heavy and light
|
| 132 |
near-end noise (NE-ST mixed with DNS5 background at 5 dB and 20 dB
|
| 133 |
-
SNR),
|
| 134 |
-
|
|
|
|
|
|
|
| 135 |
|
| 136 |
Weights: [LocalAI-io/LocalVQE](https://huggingface.co/LocalAI-io/LocalVQE) ·
|
| 137 |
Code: [github.com/richiejp/LocalVQE](https://github.com/richiejp/LocalVQE)
|
|
@@ -152,12 +167,19 @@ with gr.Blocks(title="LocalVQE Demo") as demo:
|
|
| 152 |
label=(
|
| 153 |
"Examples — top to bottom: near-end + heavy noise (5 dB SNR, "
|
| 154 |
"pure NS), near-end + light noise (20 dB SNR, NS preserving "
|
| 155 |
-
"clean speech),
|
| 156 |
-
"
|
|
|
|
| 157 |
),
|
| 158 |
)
|
| 159 |
|
| 160 |
btn.click(enhance, inputs=[mic_in, ref_in], outputs=out)
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if __name__ == "__main__":
|
| 163 |
demo.launch()
|
|
|
|
| 5 |
`LOCALVQE_LOCAL_CKPT=/path/to/checkpoint.pt` to load a local file
|
| 6 |
instead — useful for auditioning new training runs.
|
| 7 |
"""
|
| 8 |
+
import hashlib
|
| 9 |
import os
|
| 10 |
from pathlib import Path
|
| 11 |
|
|
|
|
| 23 |
EXAMPLES_DIR = Path(__file__).resolve().parent / "examples"
|
| 24 |
|
| 25 |
|
| 26 |
+
def _sha256(path: str) -> str:
|
| 27 |
+
h = hashlib.sha256()
|
| 28 |
+
with open(path, "rb") as f:
|
| 29 |
+
for chunk in iter(lambda: f.read(1 << 20), b""):
|
| 30 |
+
h.update(chunk)
|
| 31 |
+
return h.hexdigest()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _build_model() -> tuple[LocalVQE, dict]:
|
| 35 |
local_override = os.environ.get("LOCALVQE_LOCAL_CKPT")
|
| 36 |
if local_override:
|
| 37 |
ckpt_path = local_override
|
| 38 |
+
source = f"local:{ckpt_path}"
|
| 39 |
print(f"Loading local checkpoint: {ckpt_path}")
|
| 40 |
else:
|
| 41 |
from huggingface_hub import hf_hub_download
|
| 42 |
ckpt_path = hf_hub_download(repo_id=REPO_ID, filename=CKPT_FILE)
|
| 43 |
+
source = f"hf:{REPO_ID}/{CKPT_FILE}"
|
| 44 |
+
sha = _sha256(ckpt_path)
|
| 45 |
cfg = Config()
|
| 46 |
peek = torch.load(ckpt_path, map_location="cpu", weights_only=False)
|
| 47 |
apply_ckpt_model_config(peek, cfg)
|
|
|
|
| 55 |
model.align.fold_temperature()
|
| 56 |
model.eval()
|
| 57 |
n_params = sum(p.numel() for p in model.parameters())
|
| 58 |
+
info = {"source": source, "sha256": sha, "n_params": n_params}
|
| 59 |
+
print(f"LocalVQE loaded: {n_params:,} params sha256={sha} from {source}")
|
| 60 |
+
return model, info
|
| 61 |
|
| 62 |
|
| 63 |
+
MODEL, MODEL_INFO = _build_model()
|
| 64 |
|
| 65 |
|
| 66 |
def _load_mono_16k(path: str) -> np.ndarray:
|
|
|
|
| 106 |
|
| 107 |
EXAMPLES = [
|
| 108 |
[
|
| 109 |
+
str(EXAMPLES_DIR / "ne_st_noisy_mic.wav"),
|
| 110 |
+
str(EXAMPLES_DIR / "ne_st_noisy_ref.wav"),
|
| 111 |
],
|
| 112 |
[
|
| 113 |
+
str(EXAMPLES_DIR / "ne_st_clean_mic.wav"),
|
| 114 |
+
str(EXAMPLES_DIR / "ne_st_clean_ref.wav"),
|
| 115 |
],
|
| 116 |
[
|
| 117 |
+
str(EXAMPLES_DIR / "fe_st_mic.wav"),
|
| 118 |
+
str(EXAMPLES_DIR / "fe_st_ref.wav"),
|
| 119 |
],
|
| 120 |
[
|
| 121 |
+
str(EXAMPLES_DIR / "fe_st2_mic.wav"),
|
| 122 |
+
str(EXAMPLES_DIR / "fe_st2_ref.wav"),
|
| 123 |
],
|
| 124 |
[
|
| 125 |
+
str(EXAMPLES_DIR / "dt_mic.wav"),
|
| 126 |
+
str(EXAMPLES_DIR / "dt_ref.wav"),
|
| 127 |
],
|
| 128 |
]
|
| 129 |
|
|
|
|
| 143 |
|
| 144 |
Try the bundled examples first — they cover heavy and light
|
| 145 |
near-end noise (NE-ST mixed with DNS5 background at 5 dB and 20 dB
|
| 146 |
+
SNR), a clean far-end single-talk clip, a far-end clip with some
|
| 147 |
+
near-end overlap (mislabelled in the source corpus, but a useful
|
| 148 |
+
test of AEC + near-end preservation together), and a double-talk
|
| 149 |
+
clip — all from the ICASSP 2022 AEC Challenge blind set.
|
| 150 |
|
| 151 |
Weights: [LocalAI-io/LocalVQE](https://huggingface.co/LocalAI-io/LocalVQE) ·
|
| 152 |
Code: [github.com/richiejp/LocalVQE](https://github.com/richiejp/LocalVQE)
|
|
|
|
| 167 |
label=(
|
| 168 |
"Examples — top to bottom: near-end + heavy noise (5 dB SNR, "
|
| 169 |
"pure NS), near-end + light noise (20 dB SNR, NS preserving "
|
| 170 |
+
"clean speech), far-end single-talk (pure AEC), far-end with "
|
| 171 |
+
"brief near-end overlap (AEC while preserving NE), double-talk "
|
| 172 |
+
"(AEC while near-end is also talking)."
|
| 173 |
),
|
| 174 |
)
|
| 175 |
|
| 176 |
btn.click(enhance, inputs=[mic_in, ref_in], outputs=out)
|
| 177 |
|
| 178 |
+
gr.Markdown(
|
| 179 |
+
f"<sub>Loaded: <code>{MODEL_INFO['source']}</code> · "
|
| 180 |
+
f"sha256 <code>{MODEL_INFO['sha256'][:16]}…</code> · "
|
| 181 |
+
f"{MODEL_INFO['n_params']:,} params</sub>"
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
if __name__ == "__main__":
|
| 185 |
demo.launch()
|
examples/dt_mic.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8d0c15e56a4c7f387847451952123e8f23161520e81c1ba8878c922f11592a62
|
| 3 |
+
size 320044
|
examples/dt_ref.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e931d49e2802c1d7b15750caff1455495cd1dc40d27cb2fe51a8c58cbebe60fe
|
| 3 |
+
size 320044
|
examples/fe_st2_mic.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66507ac731a84abd30d4c2cbc7701d16464d705067faef7f1c926049230483c6
|
| 3 |
+
size 320044
|
examples/fe_st2_ref.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e1cef5940243d47e2ac450d7781de0a3f0a805fd8843b77830fb7084e662b71
|
| 3 |
+
size 320044
|
examples/fe_st_mic.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:564b26602ebfb64aa55a7646a2e3ec4c76801e8e602cfb94e07e7f0beed60f0c
|
| 3 |
+
size 320044
|
examples/fe_st_ref.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:390d63e7ea4741e958f4af4eeb1d729c4b58ccf4acce26dc82273f3c12ead0e5
|
| 3 |
+
size 320044
|
examples/ne_st_clean_mic.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8392be1a0baa7b8f308dab157209c7cf52f22fefe9de4c0e8a07771b5acefad
|
| 3 |
+
size 320044
|
examples/ne_st_clean_ref.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:16295fb4145e19a50076b0e24ef051186a01a28276532c549416fff9259888f8
|
| 3 |
+
size 320044
|
examples/ne_st_noisy_mic.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d3fb1b800eddd790a1e033a5d553044173cd42e295a8a033a8d97de4e077445
|
| 3 |
+
size 320044
|
examples/ne_st_noisy_ref.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b1907eaf1a27fdc96774ac001fe11766142aadddbe36a95e563be78b708e59d
|
| 3 |
+
size 320044
|