karim23657 committed on
Commit
8c8f185
·
verified ·
1 Parent(s): f2166a4

Upload folder using huggingface_hub

Browse files
Files changed (11) hide show
  1. .gitattributes +1 -0
  2. .gitignore +9 -0
  3. LICENSE +21 -0
  4. README.md +7 -5
  5. app.py +92 -0
  6. denoiser.onnx +3 -0
  7. denoiser.py +75 -0
  8. denoiser_output.wav +3 -0
  9. packages.txt +1 -0
  10. pyproject.toml +6 -0
  11. requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ denoiser_output.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ /data
2
+ /runs
3
+ /scripts
4
+ /dist
5
+ /build
6
+ /*.egg-info
7
+ /flagged
8
+ version.py
9
+ __pycache__
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Resemble AI
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,15 @@
1
  ---
2
- title: Resemble Denoise Onnx
3
  emoji: 🚀
4
  colorFrom: red
5
- colorTo: gray
6
  sdk: gradio
7
- sdk_version: 6.1.0
8
  app_file: app.py
9
  pinned: false
10
- short_description: Denoise audio files on cpu
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
1
  ---
2
+ title: Resemble Enhance
3
  emoji: 🚀
4
  colorFrom: red
5
+ colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 4.8.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # Resemble Enhance
14
+
15
+ Resemble Enhance is an AI-powered tool that aims to improve the overall quality of speech by performing denoising and enhancement. It consists of two modules: a denoiser, which separates speech from noisy audio, and an enhancer, which further boosts the perceptual audio quality by restoring audio distortions and extending the audio bandwidth. The two models are trained on high-quality 44.1kHz speech data, which ensures high-quality enhancement of your speech.
app.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from functools import partial
3
+
4
+ import gradio as gr
5
+
6
+ import time
7
+ import numpy as np
8
+ from denoiser import run
9
+ import onnxruntime
10
+ import librosa
11
+ import scipy
12
+
13
# ONNX Runtime settings shared by the single inference session below.
opts = onnxruntime.SessionOptions()
opts.log_severity_level = 4
opts.intra_op_num_threads = 4
opts.inter_op_num_threads = 4

# The denoiser model is loaded once, when this module is imported, and is run
# on the CPU execution provider.
# Other providers that were tried here: "ROCMExecutionProvider", "DnnlExecutionProvider".
session = onnxruntime.InferenceSession(
    'denoiser.onnx',
    sess_options=opts,
    providers=["CPUExecutionProvider"],
)
25
+
26
def _fn(path, solver, nfe, tau, denoising, unlimited):
    """Denoise the uploaded audio file and return it for the audio output.

    Args:
        path: Filesystem path of the uploaded audio, or None when nothing
            was uploaded.
        solver: Accepted to match the interface inputs; not used here.
        nfe: Accepted to match the interface inputs; not used here.
        tau: Accepted to match the interface inputs; not used here.
        denoising: Accepted to match the interface inputs; not used here.
        unlimited: Value of the ``--unlimited`` flag; not used here.

    Returns:
        A ``(sample_rate, samples)`` pair for the single audio output, or
        None when no file was provided.
    """
    if path is None:
        gr.Warning("Please upload an audio file.")
        # Only one output component is registered, so a single value is returned.
        return None

    wav, sr = librosa.load(path, mono=True)

    start = time.time()
    wav_onnx, new_sr = run(session, wav, sr, batch_process_chunks=False)
    print(f'Ran in {time.time() - start}s')

    # The previous version converted undefined ``wav1``/``wav2`` tensors here,
    # which raised before anything could be returned.
    return new_sr, wav_onnx
41
+
42
+
43
def main():
    """Parse the command line, assemble the Gradio interface and launch it."""
    cli = argparse.ArgumentParser()
    cli.add_argument("--unlimited", action="store_true")
    cli_args = cli.parse_args()

    # Input components, in the positional order expected by ``_fn``.
    audio_input = gr.Audio(type="filepath", label="Input Audio")
    solver_choice = gr.Dropdown(
        choices=["Midpoint", "RK4", "Euler"],
        value="Midpoint",
        label="CFM ODE Solver (Midpoint is recommended)",
    )
    nfe_slider = gr.Slider(
        minimum=1,
        maximum=256,
        value=64,
        step=1,
        label="CFM Number of Function Evaluations (higher values in general yield better quality but may be slower)",
    )
    tau_slider = gr.Slider(
        minimum=0,
        maximum=1,
        value=0.5,
        step=0.01,
        label="CFM Prior Temperature (higher values can improve quality but can reduce stability)",
    )
    denoise_first = gr.Checkbox(
        value=False,
        label="Denoise Before Enhancement (tick if your audio contains heavy background noise)",
    )
    inputs: list = [audio_input, solver_choice, nfe_slider, tau_slider, denoise_first]

    # A single audio player shows the denoised result.
    outputs: list = [gr.Audio(label="Output Denoised Audio")]

    # The command-line flag is bound here so the UI only supplies the five inputs.
    handler = partial(_fn, unlimited=cli_args.unlimited)
    interface = gr.Interface(
        fn=handler,
        title="Resemble Enhance",
        description="AI-driven audio enhancement for your audio files, powered by Resemble AI.",
        inputs=inputs,
        outputs=outputs,
    )
    interface.launch()


if __name__ == "__main__":
    main()
denoiser.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:490640369540d1b0948352b75f880e215863a0de0b95a4b621ef590ee0e04e77
3
+ size 42661638
denoiser.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from librosa import stft, istft
3
+
4
+ from resampy.core import resample
5
+
6
# STFT framing shared by _stft and _istft: the analysis window and the FFT
# size are both four hops long.
stft_hop_length = 420
n_fft = win_length = stft_hop_length * 4
8
+
9
def _stft(x):
    """Return the magnitude, phase cosine and phase sine of the STFT of ``x``.

    The transform uses a Hann window with the module-level framing constants,
    centered frames and reflect padding. The final frame is discarded before
    the spectrum is split into its three components.
    """
    spectrum = stft(
        x,
        n_fft=n_fft,
        hop_length=stft_hop_length,
        win_length=win_length,
        window='hann',
        center=True,
        pad_mode='reflect',
    )[..., :-1]

    phase = np.angle(spectrum)
    return np.abs(spectrum), np.cos(phase), np.sin(phase)
22
+
23
def _istft(mag: np.array, cos: np.array, sin: np.array):
    """Rebuild a waveform from magnitude and phase cosine/sine spectrograms.

    The three inputs must be 3-D arrays whose last axis is the frame axis;
    the padding below is written for exactly three axes.
    """
    spectrum = mag * cos + (mag * sin) * 1.0j
    # Repeat the final frame once; _stft drops one frame from its output.
    spectrum = np.pad(spectrum, ((0, 0), (0, 0), (0, 1)), mode='edge')
    return istft(
        spectrum,
        n_fft=n_fft,
        hop_length=stft_hop_length,
        win_length=win_length,
        window='hann',
    )
31
+
32
def model(onnx_session, wav: np.ndarray) -> np.ndarray:
    """Run the denoiser network on a batch of waveforms.

    Args:
        onnx_session: Object whose ``run(None, inputs)`` accepts the inputs
            ``"mag"``, ``"cos"`` and ``"sin"`` and returns three arrays.
        wav: 2-D array; the last axis holds the samples of each waveform.

    Returns:
        The processed waveforms, trimmed along the last axis to the number of
        samples in ``wav``.
    """
    # Extra samples at the end so the reconstructed signal covers the whole input.
    padded_wav = np.pad(wav, ((0, 0), (0, 441)))

    mag, cos, sin = _stft(padded_wav)  # each is (batch, freq, frames)

    ort_inputs = {
        "mag": mag,
        "cos": cos,
        "sin": sin,
    }
    sep_mag, sep_cos, sep_sin = onnx_session.run(None, ort_inputs)

    o = _istft(sep_mag, sep_cos, sep_sin)

    # Trim the time axis. A plain ``o[:n]`` would slice the batch axis and
    # leave the trailing padding in place.
    return o[..., :wav.shape[-1]]
49
+
50
def run(onnx_session, wav: np.ndarray, sample_rate: int, batch_process_chunks=False) -> tuple[np.ndarray, int]:
    """Denoise a mono waveform in chunks of at most 30 seconds.

    Args:
        onnx_session: Session forwarded unchanged to ``model``.
        wav: 1-D array of audio samples.
        sample_rate: Sampling rate of ``wav``; anything other than 44100 is
            resampled to 44100 first.
        batch_process_chunks: When true, all chunks go through ``model`` in a
            single call; otherwise they are processed one at a time.

    Returns:
        A ``(samples, 44100)`` pair, where ``samples`` has the same length as
        the (resampled) input.
    """
    assert wav.ndim == 1, 'Input should be 1D (mono) wav'

    target_sr = 44_100

    # Nothing to process; also avoids a zero chunk count further down.
    if wav.shape[-1] == 0:
        return wav, target_sr

    if sample_rate != target_sr:
        wav = resample(wav, sample_rate, target_sr, filter='kaiser_best', parallel=True)

    n_samples = wav.shape[-1]

    # The audio is at target_sr from here on, so the 30-second chunk length
    # must be derived from target_sr rather than the original sample_rate.
    chunk_length = target_sr * 30
    num_chunks = 1 + (n_samples - 1) // chunk_length

    # Zero-pad so the signal splits into num_chunks rows of equal length.
    n_pad = (num_chunks - n_samples % num_chunks) % num_chunks
    chunks = np.reshape(np.pad(wav, (0, n_pad)), (num_chunks, -1))

    # Peak-normalise each chunk; the lower bound keeps silent chunks from
    # dividing by zero. Not done in place so integer input is accepted too.
    abs_max = np.clip(np.max(np.abs(chunks), axis=-1, keepdims=True), a_min=1e-7, a_max=None)
    chunks = chunks / abs_max

    if batch_process_chunks:
        res_chunks = np.asarray(model(onnx_session, chunks))
    else:
        res_chunks = np.concatenate(
            [np.asarray(model(onnx_session, chunk[None])) for chunk in chunks], axis=0
        )

    # Cut every row back to the chunk length before undoing the normalisation,
    # so consecutive chunks stay aligned when they are joined.
    res_chunks = res_chunks[..., :chunks.shape[-1]] * abs_max

    # Drop the zero padding that was added to even out the chunks.
    return np.reshape(res_chunks, (-1))[:n_samples], target_sr
denoiser_output.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0124974f9ddd0d806bf78647c2101b1a205684288154829642146069cb069367
3
+ size 3496474
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ libsox-dev
pyproject.toml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ [tool.black]
2
+ line-length = 120
3
+ target-version = ['py310']
4
+
5
+ [tool.isort]
6
+ line_length = 120
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ numpy
2
+ scipy
3
+ librosa
4
+ resampy
5
+
6
+ onnxruntime