mariesig commited on
Commit
25d15ee
·
1 Parent(s): a11d9c9

initial demo

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitattributes copy ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+
2
+ pyproject.toml
3
+ __pycache__/
4
+ .gradio/
5
+ app.py.lprof
6
+ .DS_Store
7
+ .venv/
8
+ .ruff_cache/
9
+ pyrightconfig.json
aic_api.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from typing import Any
4
+ import aiofiles
5
+ from aiofiles import os as aiofiles_os
6
+ import aiohttp
7
+ import time
8
+ from pathlib import Path
9
+ from constants import API_V2_URL, CHUNK_SIZE, TIMEOUT_FACTOR_MB, BASE_TIMEOUT_SECONDS
10
+ import os
11
+
12
+
13
class ApiParamsV2:
    """Parameter bundle for a v2 media-enhancement API request.

    Attributes mirror the JSON arguments accepted by the /medias endpoint.
    """

    def __init__(
        self,
        enhancement_level: float = 100.0,
        api_key: str = "",
        enhancement_model: str = "LARK_V2",
        loudness_target: float = -14,
        true_peak: float = -1,
        transcode: str = "WAV",
    ):
        # Assign all settings in one sweep; attribute names match the
        # keys expected by the API payload builder.
        for attr, value in (
            ("api_key", api_key),
            ("enhancement_level", enhancement_level),
            ("enhancement_model", enhancement_model),
            ("loudness_target", loudness_target),
            ("true_peak", true_peak),
            ("transcode", transcode),
        ):
            setattr(self, attr, value)
29
+
30
+
31
+ # --------------------------------------------------------------
32
+
33
+
34
async def upload_and_enhance_v2(
    url: str,
    file_path: str,
    api_key: str,
    arguments: dict[str, Any],
) -> str | None:
    """Upload *file_path* to the enhancement endpoint and return its uid.

    Args:
        url: Full POST endpoint (e.g. ``{API_V2_URL}/medias``).
        file_path: Local audio file to upload.
        api_key: Sent as the ``X-API-Key`` header.
        arguments: Enhancement settings, JSON-encoded into the
            ``media_enhancement`` form field.

    Returns:
        The uid assigned by the API on HTTP 201, or ``None`` on any other
        status (the response body is printed for diagnosis).
    """
    form_data = aiohttp.FormData()
    form_data.add_field("media_enhancement", json.dumps(arguments))

    # The file handle must stay open for the whole request: aiohttp streams
    # the form field lazily while the POST body is being sent, so the
    # session/POST is nested inside the file's context manager.
    async with aiofiles.open(file_path, "rb") as file:
        form_data.add_field(
            "file",
            file,
            content_type="application/octet-stream",
            filename=Path(file_path).name,
        )

        async with aiohttp.ClientSession(headers={"X-API-Key": api_key}) as session:
            async with session.post(url, data=form_data) as response:
                if response.status != 201:
                    response_text = await response.text()
                    print(f"Error occurred: {response_text}")
                    return None

                response_json = await response.json()
                uid = response_json["uid"]
                print(f"Uploaded file's uid: {uid}")
                return uid
62
+
63
+
64
async def download_enhanced_media_v2(
    url: str,
    output_file_path: str,
    api_key: str,
) -> int:
    """Stream the enhanced media at *url* into *output_file_path*.

    The file is written only on HTTP 200 (parent directories are created as
    needed). The HTTP status code is always returned so the caller can poll
    on 412 ("not ready yet").
    """
    headers = {"X-API-Key": api_key}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            status = response.status
            if status == 200:
                target = Path(output_file_path)
                await aiofiles_os.makedirs(target.parent, exist_ok=True)
                # Stream to disk in fixed-size chunks to bound memory use.
                async with aiofiles.open(output_file_path, "wb") as f:
                    async for chunk in response.content.iter_chunked(CHUNK_SIZE):
                        await f.write(chunk)
                print(f"Download successfully to: {output_file_path}")
            return status
78
+
79
+
80
def process_file_v2(input_file_path: str, output_file_path: str, params: ApiParamsV2) -> None:
    """Run a full upload -> poll -> download cycle against the v2 API.

    Raises:
        ValueError: when the upload is rejected (bad/missing API key).
        TimeoutError: when the enhanced file is not ready within the
            size-dependent deadline.
    """
    arguments = {
        "enhancement_level": params.enhancement_level,
        "enhancement_model": params.enhancement_model,
        "loudness_target": params.loudness_target,
        "true_peak": params.true_peak,
        "transcode": params.transcode,
    }
    api_key = params.api_key
    upload_url = f"{API_V2_URL}/medias"

    generated_name = asyncio.run(
        upload_and_enhance_v2(upload_url, input_file_path, api_key, arguments)
    )
    if generated_name is None:
        raise ValueError("API Key not found or invalid. Please check your API key.")

    # Larger files get proportionally more time before giving up.
    size_mb = os.path.getsize(input_file_path) / (1024 * 1024)
    timeout_seconds = int(size_mb * TIMEOUT_FACTOR_MB) + BASE_TIMEOUT_SECONDS
    download_url = f"{API_V2_URL}/medias/{generated_name}/file"

    # HTTP 412 means "enhancement still in progress"; poll every 5 s.
    deadline = time.time() + timeout_seconds
    status = 412
    while status == 412 and time.time() < deadline:
        time.sleep(5)
        status = asyncio.run(download_enhanced_media_v2(download_url, output_file_path, api_key))
    if status == 412:
        raise TimeoutError(f"Download timed out after {timeout_seconds} seconds. Please try again.")
aic_sdk.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from dotenv import load_dotenv
3
+ from aic import Model, AICModelType, AICParameter
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+ load_dotenv()
8
+
9
+
10
class SDKParams:
    """Settings for local (SDK-based) enhancement.

    enhancement_level is a 0-1 fraction; sdk_key is the SDK license key.
    """

    def __init__(self, enhancement_level: float, sdk_key: str):
        self.sdk_key = sdk_key
        self.enhancement_level = enhancement_level
14
+
15
+
16
def process_file_sdk(input_path: str, output_path: str, sdk_params: SDKParams):
    """Enhance *input_path* locally with the aic SDK and write to *output_path*.

    The input is loaded as mono at 48 kHz and streamed through the model in
    fixed 480-sample frames. The trailing partial frame is zero-padded up to
    a full frame before processing and only its valid samples are kept, so
    the output has exactly the same length as the input.
    """
    frame_size = 480  # must match the `frames` value the model is opened with
    audio, sample_rate = librosa.load(input_path, sr=48000, mono=True)
    audio = audio.reshape(1, -1)  # planar (channels, samples) layout
    output = np.zeros_like(audio)
    num_samples = audio.shape[1]

    with Model(
        AICModelType.QUAIL_L,
        license_key=sdk_params.sdk_key,
        sample_rate=48000,
        channels=1,
        frames=frame_size,
    ) as model:
        model.set_parameter(AICParameter.ENHANCEMENT_LEVEL, sdk_params.enhancement_level)
        # Single processing path: pad the (at most one) short final chunk
        # instead of duplicating the process/store logic per branch.
        for start in range(0, num_samples, frame_size):
            chunk = audio[:, start : start + frame_size]
            valid = chunk.shape[1]
            if valid < frame_size:
                padded = np.zeros((1, frame_size), dtype=audio.dtype)
                padded[:, :valid] = chunk
                chunk = padded
            enhanced = model.process(chunk)
            # Keep only the samples that correspond to real input.
            output[:, start : start + valid] = enhanced[:, :valid]

    # Save result (soundfile expects interleaved (samples, channels))
    sf.write(output_path, output.T, sample_rate)
app.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===============================
2
+ # Cache Cleanup (before any Gradio usage)
3
+ # ===============================
4
+ import os
5
+ import time
6
+ from typing import Optional, Any
7
+
8
+ import gradio as gr
9
+ from loguru import logger
10
+ from PIL import Image
11
+
12
+ from constants import (
13
+ NOISES,
14
+ SNR_LEVELS,
15
+ ENHANCEMENT_MODELS,
16
+ EXAMPLES_DIR,
17
+ NOISE_TYPES,
18
+ MINUTES_KEEP,
19
+ )
20
+ from aic_api import ApiParamsV2, process_file_v2
21
+ from aic_sdk import SDKParams, process_file_sdk
22
+ from audio_tools import spec_image, mix_at_snr
23
+ import shutil
24
+ import tempfile
25
+
26
+
27
+ # ===============================
28
+ # Temporary File & Cache Management
29
+ # ===============================
30
+
31
+
32
def cleanup_tmp(
    minutes_keep: int = MINUTES_KEEP,
    filter: list[str] | None = None,
    base_dir: str = "/tmp",
):
    """Delete files under *base_dir* that are older than *minutes_keep* minutes.

    Args:
        minutes_keep: Minimum age in minutes before a file is eligible for
            deletion.
        filter: Substrings; a file whose full path contains any of them is
            kept. (Parameter name kept for backward compatibility even
            though it shadows the builtin; default changed from a mutable
            ``[]`` to ``None``.)
        base_dir: Root directory to sweep; defaults to /tmp.
    """
    patterns = filter or []
    if not os.path.exists(base_dir):
        return
    now = time.time()
    for root, _, files in os.walk(base_dir):
        for name in files:
            f = os.path.join(root, name)
            # Keep filter check first so protected files are never stat'ed away.
            if any(p in f for p in patterns):
                logger.info(f"Skipped file {f} (filtered)")
                continue
            try:
                age_minutes = (now - os.path.getmtime(f)) / 60
            except OSError:
                # File vanished between walk() and stat(); nothing to clean.
                continue
            if age_minutes <= minutes_keep:
                logger.info(f"Skipped file {f} (not old)")
                continue
            try:
                os.remove(f)
                logger.info(f"Removed file {f}")
            except Exception as e:
                logger.warning(f"Failed to remove file {f}: {e}")
50
+
51
+
52
+ # ===============================
53
+ # Interface Logic
54
+ # ===============================
55
+
56
+
57
def denoise_audio(
    sample_path: str,
    noise_type: str,
    snr: str,
    enhancement_level: float = 50.0,
    enhancement_model: str = "FINCH",
    api_key: str = "",
) -> tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """Optionally add noise to the sample, enhance it, and render spectrograms.

    Returns paths (noisy audio, noisy spectrogram, enhanced audio, enhanced
    spectrogram), or four ``None``s when enhancement fails (a Gradio warning
    is shown with the error).
    """
    stem, suffix = os.path.splitext(sample_path)
    enhanced_path = f"{stem}_enhanced{suffix}"
    noisy_path = f"{stem}_noisy{suffix}"
    noisy_spec_path = f"{stem}_noisy_spectrogram.png"
    enhanced_spec_path = f"{stem}_enhanced_spectrogram.png"

    # Mix in background noise at the requested SNR unless "None" was chosen.
    if noise_type == "None":
        noisy_path = sample_path
    else:
        clipped = mix_at_snr(
            signal_path=sample_path,
            noise_path=NOISES[noise_type],
            output_path=noisy_path,
            snr_db=int(snr),
        )
        if clipped:
            gr.Warning("Adding noise caused clipping. Normalizing might alter the SNR.")

    try:
        if enhancement_model == "QUAIL":
            # Local SDK path: enhancement level is a 0-1 fraction here.
            process_file_sdk(
                noisy_path,
                enhanced_path,
                SDKParams(
                    enhancement_level=enhancement_level / 100,
                    sdk_key=os.getenv("SECRET_SDK_KEY"),
                ),
            )
        else:
            # Remote API path (FINCH / LARK_V2).
            process_file_v2(
                noisy_path,
                enhanced_path,
                ApiParamsV2(
                    enhancement_level=enhancement_level,
                    enhancement_model=enhancement_model,
                    api_key=api_key,
                ),
            )
    except Exception as e:
        gr.Warning(f"{e}")
        return None, None, None, None

    # Render before/after spectrograms next to the audio files.
    spec_image(noisy_path).save(noisy_spec_path)
    spec_image(enhanced_path).save(enhanced_spec_path)
    print(f"Enhancement complete. id: {stem}")
    return noisy_path, noisy_spec_path, enhanced_path, enhanced_spec_path
106
+
107
+
108
def pick_example(
    sample_path: str,
    enhancement_level: float = 100.0,
    enhancement_model: str = "FINCH",
) -> tuple[str, str, float, str, str, Image.Image, str, Image.Image]:
    """
    Returns precomputed noisy/enhanced files and images for the given example.
    """
    sample_name = os.path.basename(sample_path)
    enhanced_path = f"assets/samples/enhanced/{sample_name}"
    spec_noisy = spec_image(sample_path)
    spec_enhanced = spec_image(enhanced_path)

    # Noise is disabled ("None") for the precomputed examples; the input
    # itself doubles as the "noisy" audio.
    return (
        sample_path,
        "None",
        enhancement_level,
        enhancement_model,
        sample_path,
        spec_noisy,
        enhanced_path,
        spec_enhanced,
    )
131
+
132
+
133
def toggle_audio_input(choice: str):
    """Show either the microphone or the file widget, clearing both values."""
    use_mic = choice == "mic"
    return (
        gr.update(visible=use_mic, value=None),
        gr.update(visible=not use_mic, value=None),
    )
138
+
139
+
140
def toggle_SNR(choice: str):
    """Hide the SNR dropdown when no noise is selected; default it to 10 dB otherwise."""
    no_noise = choice == "None"
    return gr.update(visible=not no_noise, value="None" if no_noise else "10")
145
+
146
+
147
def delete_previous_enhancement(path_to_delete: str, base_dir: str = "/tmp"):
    """Remove every file under *base_dir* whose name contains the stem of *path_to_delete*.

    Args:
        path_to_delete: Previous run's input file; its extension-free
            basename is the search key. No-op when falsy.
        base_dir: Root directory to sweep (backward-compatible addition;
            defaults to the original hard-coded /tmp).
    """
    if not path_to_delete:
        return
    filename_no_ext = os.path.splitext(os.path.basename(path_to_delete))[0]
    deleted = False
    try:
        for root, _, files in os.walk(base_dir):
            for f in files:
                if filename_no_ext not in f:
                    continue
                full_path = os.path.join(root, f)
                try:
                    os.remove(full_path)
                except OSError as e:
                    # Per-file handling so one failure no longer aborts the sweep.
                    logger.warning(f"Failed to delete file {full_path}: {e}")
                    continue
                logger.info(f"Deleted file {full_path}")
                deleted = True
        if not deleted:
            logger.warning(f"No files found to delete containing '{filename_no_ext}' in {base_dir}")
    except Exception as e:
        logger.warning(f"Failed to delete files containing '{filename_no_ext}' in {base_dir}: {e}")
166
+
167
+
168
def start_processing(
    sample_path: str, model_radio: str, api_key: str, mic_input: Optional[str] = None
) -> tuple[Any, Any, str]:
    """Validate the inputs, then copy the sample to a unique /tmp path.

    Emits a Gradio warning per missing precondition and raises ValueError
    when any check fails; on success returns UI updates (hide examples,
    disable button) plus the isolated input path.
    """
    # The microphone recording, when present, wins over the uploaded file.
    sample_path = mic_input if mic_input else sample_path

    problems = []
    if not sample_path:
        problems.append("Please provide an audio sample or use the microphone input.")
    if not api_key and model_radio in ["FINCH", "LARK_V2"]:
        problems.append("No API key provided. Please get one from https://ai-coustics.com/api/.")
    if not os.getenv("SECRET_SDK_KEY") and model_radio == "QUAIL":
        problems.append("No SDK key provided. Please contact us at https://ai-coustics.com/contact/.")
    for message in problems:
        gr.Warning(message)
    if problems:
        raise ValueError("Missing audio sample or API/SDK key.")

    gr.Info(
        "Processing started. This may take a moment. Please do not refresh or close the window."
    )
    # Copy the input to a fresh unique file in /tmp so each run is isolated.
    suffix = os.path.splitext(sample_path)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir="/tmp") as tmp_file:
        shutil.copy(sample_path, tmp_file.name)
        input_enhancement_path = tmp_file.name
    return gr.update(visible=False), gr.update(interactive=False), input_enhancement_path
193
+
194
+
195
def enable_new_input():
    """Re-show the examples row and re-enable the Enhance button after a run."""
    show_examples = gr.update(visible=True)
    enable_button = gr.update(interactive=True)
    return show_examples, enable_button
197
+
198
+
199
# ===============================
# Gradio UI Layout
# ===============================

# Hidden state variable, not shown in UI
# (delete_cache purges Gradio's own upload cache every 2 h, for files older than 2 h)
with gr.Blocks(delete_cache=(7200, 7200)) as demo:
    # Holds the /tmp copy of the current input between event-chain steps.
    input_enhancement = gr.State()
    with gr.Row():
        gr.Markdown(
            "[![AI-Coustics Logo](https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84)](https://ai-coustics.com/)",
        )
    with gr.Row():
        # Intro text lives in intro.md so it can be edited without touching code.
        gr.Markdown(open("intro.md").read())
    with gr.Row():
        # Left column: all inputs and controls.
        with gr.Column():
            api_key = gr.Textbox(
                label="AI-Coustics API Key",
                placeholder="Paste your API key here",
                type="password",
                value="",
            )
            gr.Markdown("Don't have an API key? [Get one here](https://ai-coustics.com/api/).")
            radio = gr.Radio(
                ["mic", "file"],
                value="file",
                label="How would you like to upload your audio?",
            )
            # Exactly one of these two is visible at a time (see toggle_audio_input).
            mic_input = gr.Mic(label="Input", type="filepath", visible=False)
            audio_file = gr.Audio(type="filepath", label="Input", visible=True)
            noise_type = gr.Dropdown(
                label="Add noise",
                choices=[*NOISE_TYPES],
                value="None",
            )
            # Only shown when a noise type is selected (see toggle_SNR).
            noise_level = gr.Dropdown(
                label="Noise Level (SNR)",
                choices=[*SNR_LEVELS],
                value="None",
                visible=False,
            )
            percent_slider = gr.Slider(
                minimum=1,
                maximum=100,
                value=100,
                step=1,
                label="Enhancement Level (%)",
                info=(
                    "Set how much enhancement to apply. "
                    "Lower values are more subtle, higher values are stronger."
                ),
            )
            model_radio = gr.Radio(
                [*ENHANCEMENT_MODELS],
                value="FINCH",
                label="Select Model",
                info=(
                    "FINCH: specialized on voice isolation/removing background noise. "
                    "LARK_V2: advanced speech enhancement/improvement of audio quality. "
                    "QUAIL: specialized in real-time audio enhancement."
                ),
            )
            btn = gr.Button("Enhance")

        # Right column: before/after audio players and spectrograms.
        with gr.Column():
            noisy_audio = gr.Audio(type="filepath", label="Noisy audio")
            noisy_image = gr.Image(label="Noisy spectrogram", format="png", type="filepath")
            enhanced_audio = gr.Audio(type="filepath", label="Enhanced audio")
            enhanced_image = gr.Image(label="Enhanced spectrogram", format="png", type="filepath")
    # Precomputed examples require no API key (cache_examples serves stored outputs).
    with gr.Row() as examples_group:
        examples = gr.Examples(
            examples=EXAMPLES_DIR,
            fn=pick_example,
            inputs=[audio_file, percent_slider, model_radio],
            outputs=[
                audio_file,
                noise_type,
                percent_slider,
                model_radio,
                noisy_audio,
                noisy_image,
                enhanced_audio,
                enhanced_image,
            ],
            cache_examples=True,
        )

    # Event chain: clean up the previous run's files, validate + stage the
    # input (start_processing), then enhance (denoise_audio, only on
    # success), and finally re-enable the UI.
    btn.click(delete_previous_enhancement, input_enhancement, None).then(
        start_processing,
        inputs=[audio_file, model_radio, api_key, mic_input],
        outputs=[examples_group, btn, input_enhancement],
    ).success(
        denoise_audio,
        inputs=[
            input_enhancement,
            noise_type,
            noise_level,
            percent_slider,
            model_radio,
            api_key,
        ],
        outputs=[noisy_audio, noisy_image, enhanced_audio, enhanced_image],
    ).then(enable_new_input, None, [examples_group, btn])
    radio.change(toggle_audio_input, radio, [mic_input, audio_file])
    noise_type.change(toggle_SNR, noise_type, noise_level)

# Sweep all leftover temp files from earlier runs before serving.
cleanup_tmp(minutes_keep=0, filter=[])
demo.launch(allowed_paths=["/tmp", "/"])
assets/samples/enhanced/Background.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51b4a5c2fefa0009f4c9fa277b194fad1c90fd3e1c120f53b35cbdc66bec6397
3
+ size 1441752
assets/samples/enhanced/Distortion.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76540ad7ef85b21adf4c656f4a3d7dd5244e98906544a184cbdf1885d717b0bf
3
+ size 1591938
assets/samples/enhanced/Music.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f955a147d7552a90ad148d8c4946f021f5a0fcb027d8399874266dbeb0b6f7d
3
+ size 1247876
assets/samples/enhanced/Reverb.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b20c7c1d90e2d2f2c108a58c10f037d7cfe25223f6aafb5e1ef63af899a9078
3
+ size 2211758
assets/samples/enhanced/Wind.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8082f10ba32bb361e9990e856dfc59297d28cdae26be3ee7ce4735ed2b4c3231
3
+ size 1242414
assets/samples/extra_noise/noise0.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b5c697344ee21901bcafce98b80ee9fa2577e0a6722f40eff2a8347bdd695f
3
+ size 960044
assets/samples/extra_noise/noise1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ee047bf39ead2300fe81b922dc9c6aac39338d2fc7f3c57e2efb6d856e4b5d7
3
+ size 960044
assets/samples/extra_noise/noise2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f29b0ac90468f2395f0d363d0c215be2dc928893bb84ca3cd3a0bc4be5bd4ace
3
+ size 960044
assets/samples/input/Background.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f62505d5b6099cf35827a3e8f9c1cd08bbadc8a7623a361bf72839dc7db7225
3
+ size 3973782
assets/samples/input/Distortion.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:854895fc4f3331331954f569c95540cae0a344a10b82d83fd6fd7dafbdcd2fa1
3
+ size 4387734
assets/samples/input/Music.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e331b37d45931ecf81e229e4b9449cdffb38e8a39f2d385bc3a60d894006bae6
3
+ size 3439410
assets/samples/input/Reverb.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac834e42c47ce78b15c649e175d71eea3a1da4a4a22ad2abed5307c663e99133
3
+ size 4064102
assets/samples/input/Wind.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e18809e079691fdea30e381f1b881a30c879ccff3556f0dc79b42ecc8083eb36
3
+ size 3424358
audio_tools.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import numpy as np
3
+ import librosa
4
+ from PIL import Image
5
+ import io
6
+ import matplotlib.pyplot as plt
7
+ import soundfile as sf
8
+
9
+
10
def spec_image(
    audio_wav: str,
    n_fft: int = 2048,
    hop_length: int = 512,
    n_mels: int = 128,
    fmax: Optional[float] = None,
) -> Image.Image:
    """
    Generate a mel-spectrogram image from an audio file.
    """
    samples, rate = librosa.load(audio_wav, mono=True, sr=None)
    mel = librosa.feature.melspectrogram(
        y=samples,
        sr=rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmax=fmax or rate // 2,
    )
    mel_db = librosa.power_to_db(mel, ref=np.max(mel))

    fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
    mappable = librosa.display.specshow(
        mel_db, sr=rate, hop_length=hop_length, x_axis="time", y_axis="mel", ax=ax
    )
    colorbar = fig.colorbar(mappable, ax=ax, format="%+2.0f dB")
    colorbar.set_label("dB")
    ax.set_title("Mel-spectrogram")
    ax.set_xlabel("Time in s")
    ax.set_ylabel("Frequency in Hz")
    fig.tight_layout(pad=0.2)

    # Render the figure into an in-memory PNG and hand back a PIL image,
    # closing the figure to avoid matplotlib's open-figure accumulation.
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf).convert("RGB")
45
+
46
+
47
def mix_at_snr(
    signal_path: str,
    noise_path: str,
    output_path: str = "output.wav",
    snr_db: float = 10.0,
    rng: Optional[np.random.Generator] = None,
) -> bool:
    """
    Mix noise into clean audio at a target SNR (in dB) and write the mixture.

    Args:
        signal_path: Path to clean/foreground audio (wav/mp3).
        noise_path: Path to noise audio (wav/mp3); resampled to the signal's
            rate, looped if shorter, randomly cropped if longer.
        output_path: Where the mixture is written, at the signal's sample rate.
        snr_db: Desired SNR in dB (signal/noise).
        rng: Optional numpy Generator for reproducible random cropping.

    Returns:
        Bool whether clipping occurred. When the mixture exceeds |1.0| it is
        peak-normalized, which can slightly alter the achieved SNR.
    """
    clipped = False
    rng = rng or np.random.default_rng()

    sig, sr_s = librosa.load(signal_path, mono=True, sr=None)
    noise, sr_n = librosa.load(noise_path, mono=True, sr=None)

    # Resample noise if needed
    if sr_s != sr_n:
        noise = librosa.resample(noise, orig_sr=sr_n, target_sr=sr_s, res_type="kaiser_best")

    # Match lengths: loop the noise when shorter, randomly crop when longer.
    L = len(sig)
    if len(noise) < L:
        reps = int(np.ceil(L / len(noise)))
        noise = np.tile(noise, reps)[:L]
    else:
        start = rng.integers(0, len(noise) - L + 1) if len(noise) > L else 0
        noise = noise[start : start + L]

    sig_power = float(np.mean(sig**2))
    noise_power = float(np.mean(noise**2))

    if sig_power == 0.0:
        # Silent input signal: emit silence rather than pure noise.
        out = noise * 0.0
    elif noise_power == 0.0:
        # Silent noise: nothing to mix in.
        out = sig.copy()
    else:
        # Scale noise so sig_power / scaled_noise_power equals the target SNR.
        target_noise_power = sig_power / (10.0 ** (snr_db / 10.0))
        scale = np.sqrt(target_noise_power / noise_power)
        noise_scaled = noise * scale
        out = sig + noise_scaled

    # "or 1.0" guards the all-zero case; only normalize when clipping occurs.
    peak = np.max(np.abs(out)) or 1.0
    if peak > 1.0:
        clipped = True
        out = out / peak
    sf.write(output_path, out, sr_s)
    return clipped
constants.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Final
2
+
3
+
4
# Models offered in the UI; QUAIL runs locally via the SDK, the others via the API.
ENHANCEMENT_MODELS: Final = ["FINCH", "LARK_V2", "QUAIL"]
API_V2_URL: Final = "https://api.ai-coustics.io/v2"
# Streaming chunk size (bytes) for downloads.
CHUNK_SIZE: Final = 1024
# Download-poll timeout: seconds allowed per MB of input, plus a fixed base.
TIMEOUT_FACTOR_MB: Final = 60
BASE_TIMEOUT_SECONDS: Final = 120


# Default minimum file age (minutes) before cleanup_tmp deletes a temp file.
MINUTES_KEEP: Final = 60

# UI noise choices mapped to noise sample files ("None" disables mixing).
NOISES: Final = {
    "None": "None",
    "Noise_0": "assets/samples/extra_noise/noise0.wav",
    "Noise_1": "assets/samples/extra_noise/noise1.wav",
    "Noise_2": "assets/samples/extra_noise/noise2.wav",
}
NOISE_TYPES: Final = list(NOISES.keys())
# SNR dropdown values (dB, as strings; "None" shown when noise is disabled).
SNR_LEVELS: Final = ["None", "-5", "0", "10", "20"]
EXAMPLES: Final = ["Background", "Reverb", "Distortion", "Wind", "Music"]
# Each row: [input sample path, enhancement level (%), model] for gr.Examples.
EXAMPLES_DIR: Final = [
    [
        "assets/samples/input/Background.wav",
        100,
        "FINCH",
    ],
    [
        "assets/samples/input/Reverb.wav",
        100,
        "QUAIL",
    ],
    [
        "assets/samples/input/Distortion.wav",
        100,
        "LARK_V2",
    ],
    [
        "assets/samples/input/Wind.wav",
        100,
        "LARK_V2",
    ],
    [
        "assets/samples/input/Music.wav",
        100,
        "LARK_V2",
    ],
]
intro.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Welcome! This interactive demo allows you to denoise and enhance audio files using AI-Coustics models.
2
+ Learn more about our technology and its capabilities at [AI-Coustics](https://ai-coustics.com/).
3
+
4
+ **How to Use:**
5
+
6
+ Upload or record a (noisy) speech sample to enhance its quality. You can optionally add background noise to the input. To generate new enhanced audio, you'll need an API key. Alternatively, you can preview some enhancement results instantly by selecting one of the preprocessed examples below — no API key required.
7
+
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Requires Python 3.11
2
+ matplotlib>=3.8,<3.10
3
+ soundfile>=0.12.1
4
+ aiohttp>=3.9,<4
5
+ librosa>=0.10.1,<0.11
6
+ loguru~=0.7
7
+ aic-sdk
8
+ python-dotenv
9
+ resampy