mariesig commited on
Commit
25d15ee
·
1 Parent(s): a11d9c9

initial demo

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitattributes copy ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+
2
+ pyproject.toml
3
+ __pycache__/
4
+ .gradio/
5
+ app.py.lprof
6
+ .DS_Store
7
+ .venv/
8
+ .ruff_cache/
9
+ pyrightconfig.json
aic_api.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ from typing import Any
4
+ import aiofiles
5
+ from aiofiles import os as aiofiles_os
6
+ import aiohttp
7
+ import time
8
+ from pathlib import Path
9
+ from constants import API_V2_URL, CHUNK_SIZE, TIMEOUT_FACTOR_MB, BASE_TIMEOUT_SECONDS
10
+ import os
11
+
12
+
13
class ApiParamsV2:
    """Parameter bundle for a v2 media-enhancement API request.

    Attributes mirror the JSON arguments accepted by the /medias endpoint.
    """

    def __init__(
        self,
        enhancement_level: float = 100.0,
        api_key: str = "",
        enhancement_model: str = "LARK_V2",
        loudness_target: float = -14,
        true_peak: float = -1,
        transcode: str = "WAV",
    ):
        # Assign all settings in one sweep; attribute names match the
        # keys expected by the API payload builder.
        for attr, value in (
            ("api_key", api_key),
            ("enhancement_level", enhancement_level),
            ("enhancement_model", enhancement_model),
            ("loudness_target", loudness_target),
            ("true_peak", true_peak),
            ("transcode", transcode),
        ):
            setattr(self, attr, value)
29
+
30
+
31
+ # --------------------------------------------------------------
32
+
33
+
34
async def upload_and_enhance_v2(
    url: str,
    file_path: str,
    api_key: str,
    arguments: dict[str, Any],
) -> str | None:
    """Upload *file_path* to the enhancement endpoint and return its uid.

    Args:
        url: Full POST endpoint (e.g. ``{API_V2_URL}/medias``).
        file_path: Local audio file to upload.
        api_key: Sent as the ``X-API-Key`` header.
        arguments: Enhancement settings, JSON-encoded into the
            ``media_enhancement`` form field.

    Returns:
        The uid assigned by the API on HTTP 201, or ``None`` on any other
        status (the response body is printed for diagnosis).
    """
    form_data = aiohttp.FormData()
    form_data.add_field("media_enhancement", json.dumps(arguments))

    # The file handle must stay open for the whole request: aiohttp streams
    # the form field lazily while the POST body is being sent, so the
    # session/POST is nested inside the file's context manager.
    async with aiofiles.open(file_path, "rb") as file:
        form_data.add_field(
            "file",
            file,
            content_type="application/octet-stream",
            filename=Path(file_path).name,
        )

        async with aiohttp.ClientSession(headers={"X-API-Key": api_key}) as session:
            async with session.post(url, data=form_data) as response:
                if response.status != 201:
                    response_text = await response.text()
                    print(f"Error occurred: {response_text}")
                    return None

                response_json = await response.json()
                uid = response_json["uid"]
                print(f"Uploaded file's uid: {uid}")
                return uid
62
+
63
+
64
async def download_enhanced_media_v2(
    url: str,
    output_file_path: str,
    api_key: str,
) -> int:
    """Stream the enhanced media at *url* into *output_file_path*.

    The file is written only on HTTP 200 (parent directories are created as
    needed). The HTTP status code is always returned so the caller can poll
    on 412 ("not ready yet").
    """
    headers = {"X-API-Key": api_key}
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(url) as response:
            status = response.status
            if status == 200:
                target = Path(output_file_path)
                await aiofiles_os.makedirs(target.parent, exist_ok=True)
                # Stream to disk in fixed-size chunks to bound memory use.
                async with aiofiles.open(output_file_path, "wb") as f:
                    async for chunk in response.content.iter_chunked(CHUNK_SIZE):
                        await f.write(chunk)
                print(f"Download successfully to: {output_file_path}")
            return status
78
+
79
+
80
def process_file_v2(input_file_path: str, output_file_path: str, params: ApiParamsV2) -> None:
    """Run a full upload -> poll -> download cycle against the v2 API.

    Raises:
        ValueError: when the upload is rejected (bad/missing API key).
        TimeoutError: when the enhanced file is not ready within the
            size-dependent deadline.
    """
    arguments = {
        "enhancement_level": params.enhancement_level,
        "enhancement_model": params.enhancement_model,
        "loudness_target": params.loudness_target,
        "true_peak": params.true_peak,
        "transcode": params.transcode,
    }
    api_key = params.api_key
    upload_url = f"{API_V2_URL}/medias"

    generated_name = asyncio.run(
        upload_and_enhance_v2(upload_url, input_file_path, api_key, arguments)
    )
    if generated_name is None:
        raise ValueError("API Key not found or invalid. Please check your API key.")

    # Larger files get proportionally more time before giving up.
    size_mb = os.path.getsize(input_file_path) / (1024 * 1024)
    timeout_seconds = int(size_mb * TIMEOUT_FACTOR_MB) + BASE_TIMEOUT_SECONDS
    download_url = f"{API_V2_URL}/medias/{generated_name}/file"

    # HTTP 412 means "enhancement still in progress"; poll every 5 s.
    deadline = time.time() + timeout_seconds
    status = 412
    while status == 412 and time.time() < deadline:
        time.sleep(5)
        status = asyncio.run(download_enhanced_media_v2(download_url, output_file_path, api_key))
    if status == 412:
        raise TimeoutError(f"Download timed out after {timeout_seconds} seconds. Please try again.")
aic_sdk.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from dotenv import load_dotenv
3
+ from aic import Model, AICModelType, AICParameter
4
+ import librosa
5
+ import soundfile as sf
6
+
7
+ load_dotenv()
8
+
9
+
10
class SDKParams:
    """Settings for local (SDK-based) enhancement.

    enhancement_level is a 0-1 fraction; sdk_key is the SDK license key.
    """

    def __init__(self, enhancement_level: float, sdk_key: str):
        self.sdk_key = sdk_key
        self.enhancement_level = enhancement_level
14
+
15
+
16
def process_file_sdk(input_path: str, output_path: str, sdk_params: SDKParams):
    """Enhance *input_path* locally with the aic SDK and write to *output_path*.

    The input is loaded as mono at 48 kHz and streamed through the model in
    fixed 480-sample frames. The trailing partial frame is zero-padded up to
    a full frame before processing and only its valid samples are kept, so
    the output has exactly the same length as the input.
    """
    frame_size = 480  # must match the `frames` value the model is opened with
    audio, sample_rate = librosa.load(input_path, sr=48000, mono=True)
    audio = audio.reshape(1, -1)  # planar (channels, samples) layout
    output = np.zeros_like(audio)
    num_samples = audio.shape[1]

    with Model(
        AICModelType.QUAIL_L,
        license_key=sdk_params.sdk_key,
        sample_rate=48000,
        channels=1,
        frames=frame_size,
    ) as model:
        model.set_parameter(AICParameter.ENHANCEMENT_LEVEL, sdk_params.enhancement_level)
        # Single processing path: pad the (at most one) short final chunk
        # instead of duplicating the process/store logic per branch.
        for start in range(0, num_samples, frame_size):
            chunk = audio[:, start : start + frame_size]
            valid = chunk.shape[1]
            if valid < frame_size:
                padded = np.zeros((1, frame_size), dtype=audio.dtype)
                padded[:, :valid] = chunk
                chunk = padded
            enhanced = model.process(chunk)
            # Keep only the samples that correspond to real input.
            output[:, start : start + valid] = enhanced[:, :valid]

    # Save result (soundfile expects interleaved (samples, channels))
    sf.write(output_path, output.T, sample_rate)
app.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ===============================
2
+ # Cache Cleanup (before any Gradio usage)
3
+ # ===============================
4
+ import os
5
+ import time
6
+ from typing import Optional, Any
7
+
8
+ import gradio as gr
9
+ from loguru import logger
10
+ from PIL import Image
11
+
12
+ from constants import (
13
+ NOISES,
14
+ SNR_LEVELS,
15
+ ENHANCEMENT_MODELS,
16
+ EXAMPLES_DIR,
17
+ NOISE_TYPES,
18
+ MINUTES_KEEP,
19
+ )
20
+ from aic_api import ApiParamsV2, process_file_v2
21
+ from aic_sdk import SDKParams, process_file_sdk
22
+ from audio_tools import spec_image, mix_at_snr
23
+ import shutil
24
+ import tempfile
25
+
26
+
27
+ # ===============================
28
+ # Temporary File & Cache Management
29
+ # ===============================
30
+
31
+
32
def cleanup_tmp(
    minutes_keep: int = MINUTES_KEEP,
    filter: list[str] | None = None,
    base_dir: str = "/tmp",
):
    """Delete files under *base_dir* that are older than *minutes_keep* minutes.

    Args:
        minutes_keep: Minimum age in minutes before a file is eligible for
            deletion.
        filter: Substrings; a file whose full path contains any of them is
            kept. (Parameter name kept for backward compatibility even
            though it shadows the builtin; default changed from a mutable
            ``[]`` to ``None``.)
        base_dir: Root directory to sweep; defaults to /tmp.
    """
    patterns = filter or []
    if not os.path.exists(base_dir):
        return
    now = time.time()
    for root, _, files in os.walk(base_dir):
        for name in files:
            f = os.path.join(root, name)
            # Keep filter check first so protected files are never stat'ed away.
            if any(p in f for p in patterns):
                logger.info(f"Skipped file {f} (filtered)")
                continue
            try:
                age_minutes = (now - os.path.getmtime(f)) / 60
            except OSError:
                # File vanished between walk() and stat(); nothing to clean.
                continue
            if age_minutes <= minutes_keep:
                logger.info(f"Skipped file {f} (not old)")
                continue
            try:
                os.remove(f)
                logger.info(f"Removed file {f}")
            except Exception as e:
                logger.warning(f"Failed to remove file {f}: {e}")
50
+
51
+
52
+ # ===============================
53
+ # Interface Logic
54
+ # ===============================
55
+
56
+
57
def denoise_audio(
    sample_path: str,
    noise_type: str,
    snr: str,
    enhancement_level: float = 50.0,
    enhancement_model: str = "FINCH",
    api_key: str = "",
) -> tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
    """Optionally add noise to the sample, enhance it, and render spectrograms.

    Returns paths (noisy audio, noisy spectrogram, enhanced audio, enhanced
    spectrogram), or four ``None``s when enhancement fails (a Gradio warning
    is shown with the error).
    """
    stem, suffix = os.path.splitext(sample_path)
    enhanced_path = f"{stem}_enhanced{suffix}"
    noisy_path = f"{stem}_noisy{suffix}"
    noisy_spec_path = f"{stem}_noisy_spectrogram.png"
    enhanced_spec_path = f"{stem}_enhanced_spectrogram.png"

    # Mix in background noise at the requested SNR unless "None" was chosen.
    if noise_type == "None":
        noisy_path = sample_path
    else:
        clipped = mix_at_snr(
            signal_path=sample_path,
            noise_path=NOISES[noise_type],
            output_path=noisy_path,
            snr_db=int(snr),
        )
        if clipped:
            gr.Warning("Adding noise caused clipping. Normalizing might alter the SNR.")

    try:
        if enhancement_model == "QUAIL":
            # Local SDK path: enhancement level is a 0-1 fraction here.
            process_file_sdk(
                noisy_path,
                enhanced_path,
                SDKParams(
                    enhancement_level=enhancement_level / 100,
                    sdk_key=os.getenv("SECRET_SDK_KEY"),
                ),
            )
        else:
            # Remote API path (FINCH / LARK_V2).
            process_file_v2(
                noisy_path,
                enhanced_path,
                ApiParamsV2(
                    enhancement_level=enhancement_level,
                    enhancement_model=enhancement_model,
                    api_key=api_key,
                ),
            )
    except Exception as e:
        gr.Warning(f"{e}")
        return None, None, None, None

    # Render before/after spectrograms next to the audio files.
    spec_image(noisy_path).save(noisy_spec_path)
    spec_image(enhanced_path).save(enhanced_spec_path)
    print(f"Enhancement complete. id: {stem}")
    return noisy_path, noisy_spec_path, enhanced_path, enhanced_spec_path
106
+
107
+
108
def pick_example(
    sample_path: str,
    enhancement_level: float = 100.0,
    enhancement_model: str = "FINCH",
) -> tuple[str, str, float, str, str, Image.Image, str, Image.Image]:
    """
    Returns precomputed noisy/enhanced files and images for the given example.
    """
    sample_name = os.path.basename(sample_path)
    enhanced_path = f"assets/samples/enhanced/{sample_name}"
    spec_noisy = spec_image(sample_path)
    spec_enhanced = spec_image(enhanced_path)

    # Noise is disabled ("None") for the precomputed examples; the input
    # itself doubles as the "noisy" audio.
    return (
        sample_path,
        "None",
        enhancement_level,
        enhancement_model,
        sample_path,
        spec_noisy,
        enhanced_path,
        spec_enhanced,
    )
131
+
132
+
133
def toggle_audio_input(choice: str):
    """Show either the microphone or the file widget, clearing both values."""
    use_mic = choice == "mic"
    return (
        gr.update(visible=use_mic, value=None),
        gr.update(visible=not use_mic, value=None),
    )
138
+
139
+
140
def toggle_SNR(choice: str):
    """Hide the SNR dropdown when no noise is selected; default it to 10 dB otherwise."""
    no_noise = choice == "None"
    return gr.update(visible=not no_noise, value="None" if no_noise else "10")
145
+
146
+
147
def delete_previous_enhancement(path_to_delete: str, base_dir: str = "/tmp"):
    """Remove every file under *base_dir* whose name contains the stem of *path_to_delete*.

    Args:
        path_to_delete: Previous run's input file; its extension-free
            basename is the search key. No-op when falsy.
        base_dir: Root directory to sweep (backward-compatible addition;
            defaults to the original hard-coded /tmp).
    """
    if not path_to_delete:
        return
    filename_no_ext = os.path.splitext(os.path.basename(path_to_delete))[0]
    deleted = False
    try:
        for root, _, files in os.walk(base_dir):
            for f in files:
                if filename_no_ext not in f:
                    continue
                full_path = os.path.join(root, f)
                try:
                    os.remove(full_path)
                except OSError as e:
                    # Per-file handling so one failure no longer aborts the sweep.
                    logger.warning(f"Failed to delete file {full_path}: {e}")
                    continue
                logger.info(f"Deleted file {full_path}")
                deleted = True
        if not deleted:
            logger.warning(f"No files found to delete containing '{filename_no_ext}' in {base_dir}")
    except Exception as e:
        logger.warning(f"Failed to delete files containing '{filename_no_ext}' in {base_dir}: {e}")
166
+
167
+
168
def start_processing(
    sample_path: str, model_radio: str, api_key: str, mic_input: Optional[str] = None
) -> tuple[Any, Any, str]:
    """Validate the inputs, then copy the sample to a unique /tmp path.

    Emits a Gradio warning per missing precondition and raises ValueError
    when any check fails; on success returns UI updates (hide examples,
    disable button) plus the isolated input path.
    """
    # The microphone recording, when present, wins over the uploaded file.
    sample_path = mic_input if mic_input else sample_path

    problems = []
    if not sample_path:
        problems.append("Please provide an audio sample or use the microphone input.")
    if not api_key and model_radio in ["FINCH", "LARK_V2"]:
        problems.append("No API key provided. Please get one from https://ai-coustics.com/api/.")
    if not os.getenv("SECRET_SDK_KEY") and model_radio == "QUAIL":
        problems.append("No SDK key provided. Please contact us at https://ai-coustics.com/contact/.")
    for message in problems:
        gr.Warning(message)
    if problems:
        raise ValueError("Missing audio sample or API/SDK key.")

    gr.Info(
        "Processing started. This may take a moment. Please do not refresh or close the window."
    )
    # Copy the input to a fresh unique file in /tmp so each run is isolated.
    suffix = os.path.splitext(sample_path)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, dir="/tmp") as tmp_file:
        shutil.copy(sample_path, tmp_file.name)
        input_enhancement_path = tmp_file.name
    return gr.update(visible=False), gr.update(interactive=False), input_enhancement_path
193
+
194
+
195
def enable_new_input():
    """Re-show the examples row and re-enable the Enhance button after a run."""
    show_examples = gr.update(visible=True)
    enable_button = gr.update(interactive=True)
    return show_examples, enable_button
197
+
198
+
199
# ===============================
# Gradio UI Layout
# ===============================

# Hidden state variable, not shown in UI
# (delete_cache purges Gradio's own upload cache every 2 h, for files older than 2 h)
with gr.Blocks(delete_cache=(7200, 7200)) as demo:
    # Holds the /tmp copy of the current input between event-chain steps.
    input_enhancement = gr.State()
    with gr.Row():
        gr.Markdown(
            "[![AI-Coustics Logo](https://mintcdn.com/ai-coustics/Sxcrv8jVSE2qWMR1/logo/dark.svg?fit=max&auto=format&n=Sxcrv8jVSE2qWMR1&q=85&s=7f26caaf21e963912961cbd8541e6d84)](https://ai-coustics.com/)",
        )
    with gr.Row():
        # Intro text lives in intro.md so it can be edited without touching code.
        gr.Markdown(open("intro.md").read())
    with gr.Row():
        # Left column: all inputs and controls.
        with gr.Column():
            api_key = gr.Textbox(
                label="AI-Coustics API Key",
                placeholder="Paste your API key here",
                type="password",
                value="",
            )
            gr.Markdown("Don't have an API key? [Get one here](https://ai-coustics.com/api/).")
            radio = gr.Radio(
                ["mic", "file"],
                value="file",
                label="How would you like to upload your audio?",
            )
            # Exactly one of these two is visible at a time (see toggle_audio_input).
            mic_input = gr.Mic(label="Input", type="filepath", visible=False)
            audio_file = gr.Audio(type="filepath", label="Input", visible=True)
            noise_type = gr.Dropdown(
                label="Add noise",
                choices=[*NOISE_TYPES],
                value="None",
            )
            # Only shown when a noise type is selected (see toggle_SNR).
            noise_level = gr.Dropdown(
                label="Noise Level (SNR)",
                choices=[*SNR_LEVELS],
                value="None",
                visible=False,
            )
            percent_slider = gr.Slider(
                minimum=1,
                maximum=100,
                value=100,
                step=1,
                label="Enhancement Level (%)",
                info=(
                    "Set how much enhancement to apply. "
                    "Lower values are more subtle, higher values are stronger."
                ),
            )
            model_radio = gr.Radio(
                [*ENHANCEMENT_MODELS],
                value="FINCH",
                label="Select Model",
                info=(
                    "FINCH: specialized on voice isolation/removing background noise. "
                    "LARK_V2: advanced speech enhancement/improvement of audio quality. "
                    "QUAIL: specialized in real-time audio enhancement."
                ),
            )
            btn = gr.Button("Enhance")

        # Right column: before/after audio players and spectrograms.
        with gr.Column():
            noisy_audio = gr.Audio(type="filepath", label="Noisy audio")
            noisy_image = gr.Image(label="Noisy spectrogram", format="png", type="filepath")
            enhanced_audio = gr.Audio(type="filepath", label="Enhanced audio")
            enhanced_image = gr.Image(label="Enhanced spectrogram", format="png", type="filepath")
    # Precomputed examples require no API key (cache_examples serves stored outputs).
    with gr.Row() as examples_group:
        examples = gr.Examples(
            examples=EXAMPLES_DIR,
            fn=pick_example,
            inputs=[audio_file, percent_slider, model_radio],
            outputs=[
                audio_file,
                noise_type,
                percent_slider,
                model_radio,
                noisy_audio,
                noisy_image,
                enhanced_audio,
                enhanced_image,
            ],
            cache_examples=True,
        )

    # Event chain: clean up the previous run's files, validate + stage the
    # input (start_processing), then enhance (denoise_audio, only on
    # success), and finally re-enable the UI.
    btn.click(delete_previous_enhancement, input_enhancement, None).then(
        start_processing,
        inputs=[audio_file, model_radio, api_key, mic_input],
        outputs=[examples_group, btn, input_enhancement],
    ).success(
        denoise_audio,
        inputs=[
            input_enhancement,
            noise_type,
            noise_level,
            percent_slider,
            model_radio,
            api_key,
        ],
        outputs=[noisy_audio, noisy_image, enhanced_audio, enhanced_image],
    ).then(enable_new_input, None, [examples_group, btn])
    radio.change(toggle_audio_input, radio, [mic_input, audio_file])
    noise_type.change(toggle_SNR, noise_type, noise_level)

# Sweep all leftover temp files from earlier runs before serving.
cleanup_tmp(minutes_keep=0, filter=[])
demo.launch(allowed_paths=["/tmp", "/"])
assets/samples/enhanced/Background.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51b4a5c2fefa0009f4c9fa277b194fad1c90fd3e1c120f53b35cbdc66bec6397
3
+ size 1441752
assets/samples/enhanced/Distortion.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76540ad7ef85b21adf4c656f4a3d7dd5244e98906544a184cbdf1885d717b0bf
3
+ size 1591938
assets/samples/enhanced/Music.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f955a147d7552a90ad148d8c4946f021f5a0fcb027d8399874266dbeb0b6f7d
3
+ size 1247876
assets/samples/enhanced/Reverb.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b20c7c1d90e2d2f2c108a58c10f037d7cfe25223f6aafb5e1ef63af899a9078
3
+ size 2211758
assets/samples/enhanced/Wind.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8082f10ba32bb361e9990e856dfc59297d28cdae26be3ee7ce4735ed2b4c3231
3
+ size 1242414
assets/samples/extra_noise/noise0.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79b5c697344ee21901bcafce98b80ee9fa2577e0a6722f40eff2a8347bdd695f
3
+ size 960044
assets/samples/extra_noise/noise1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ee047bf39ead2300fe81b922dc9c6aac39338d2fc7f3c57e2efb6d856e4b5d7
3
+ size 960044
assets/samples/extra_noise/noise2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f29b0ac90468f2395f0d363d0c215be2dc928893bb84ca3cd3a0bc4be5bd4ace
3
+ size 960044
assets/samples/input/Background.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f62505d5b6099cf35827a3e8f9c1cd08bbadc8a7623a361bf72839dc7db7225
3
+ size 3973782
assets/samples/input/Distortion.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:854895fc4f3331331954f569c95540cae0a344a10b82d83fd6fd7dafbdcd2fa1
3
+ size 4387734
assets/samples/input/Music.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e331b37d45931ecf81e229e4b9449cdffb38e8a39f2d385bc3a60d894006bae6
3
+ size 3439410
assets/samples/input/Reverb.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac834e42c47ce78b15c649e175d71eea3a1da4a4a22ad2abed5307c663e99133
3
+ size 4064102
assets/samples/input/Wind.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e18809e079691fdea30e381f1b881a30c879ccff3556f0dc79b42ecc8083eb36
3
+ size 3424358
audio_tools.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import numpy as np
3
+ import librosa
4
+ from PIL import Image
5
+ import io
6
+ import matplotlib.pyplot as plt
7
+ import soundfile as sf
8
+
9
+
10
def spec_image(
    audio_wav: str,
    n_fft: int = 2048,
    hop_length: int = 512,
    n_mels: int = 128,
    fmax: Optional[float] = None,
) -> Image.Image:
    """
    Generate a mel-spectrogram image from an audio file.
    """
    samples, rate = librosa.load(audio_wav, mono=True, sr=None)
    mel = librosa.feature.melspectrogram(
        y=samples,
        sr=rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels,
        fmax=fmax or rate // 2,
    )
    mel_db = librosa.power_to_db(mel, ref=np.max(mel))

    fig, ax = plt.subplots(figsize=(8, 3), dpi=150)
    mappable = librosa.display.specshow(
        mel_db, sr=rate, hop_length=hop_length, x_axis="time", y_axis="mel", ax=ax
    )
    colorbar = fig.colorbar(mappable, ax=ax, format="%+2.0f dB")
    colorbar.set_label("dB")
    ax.set_title("Mel-spectrogram")
    ax.set_xlabel("Time in s")
    ax.set_ylabel("Frequency in Hz")
    fig.tight_layout(pad=0.2)

    # Render the figure into an in-memory PNG and hand back a PIL image,
    # closing the figure to avoid matplotlib's open-figure accumulation.
    buf = io.BytesIO()
    fig.savefig(buf, format="png", bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf).convert("RGB")
45
+
46
+
47
def mix_at_snr(
    signal_path: str,
    noise_path: str,
    output_path: str = "output.wav",
    snr_db: float = 10.0,
    rng: Optional[np.random.Generator] = None,
) -> bool:
    """
    Mix noise into clean audio at a target SNR (in dB) and write the mixture.

    Args:
        signal_path: Path to clean/foreground audio (wav/mp3).
        noise_path: Path to noise audio (wav/mp3); resampled to the signal's
            rate, looped if shorter, randomly cropped if longer.
        output_path: Where the mixture is written, at the signal's sample rate.
        snr_db: Desired SNR in dB (signal/noise).
        rng: Optional numpy Generator for reproducible random cropping.

    Returns:
        Bool whether clipping occurred. When the mixture exceeds |1.0| it is
        peak-normalized, which can slightly alter the achieved SNR.
    """
    clipped = False
    rng = rng or np.random.default_rng()

    sig, sr_s = librosa.load(signal_path, mono=True, sr=None)
    noise, sr_n = librosa.load(noise_path, mono=True, sr=None)

    # Resample noise if needed
    if sr_s != sr_n:
        noise = librosa.resample(noise, orig_sr=sr_n, target_sr=sr_s, res_type="kaiser_best")

    # Match lengths: loop the noise when shorter, randomly crop when longer.
    L = len(sig)
    if len(noise) < L:
        reps = int(np.ceil(L / len(noise)))
        noise = np.tile(noise, reps)[:L]
    else:
        start = rng.integers(0, len(noise) - L + 1) if len(noise) > L else 0
        noise = noise[start : start + L]

    sig_power = float(np.mean(sig**2))
    noise_power = float(np.mean(noise**2))

    if sig_power == 0.0:
        # Silent input signal: emit silence rather than pure noise.
        out = noise * 0.0
    elif noise_power == 0.0:
        # Silent noise: nothing to mix in.
        out = sig.copy()
    else:
        # Scale noise so sig_power / scaled_noise_power equals the target SNR.
        target_noise_power = sig_power / (10.0 ** (snr_db / 10.0))
        scale = np.sqrt(target_noise_power / noise_power)
        noise_scaled = noise * scale
        out = sig + noise_scaled

    # "or 1.0" guards the all-zero case; only normalize when clipping occurs.
    peak = np.max(np.abs(out)) or 1.0
    if peak > 1.0:
        clipped = True
        out = out / peak
    sf.write(output_path, out, sr_s)
    return clipped
constants.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Final
2
+
3
+
4
# Models offered in the UI; QUAIL runs locally via the SDK, the others via the API.
ENHANCEMENT_MODELS: Final = ["FINCH", "LARK_V2", "QUAIL"]
API_V2_URL: Final = "https://api.ai-coustics.io/v2"
# Streaming chunk size (bytes) for downloads.
CHUNK_SIZE: Final = 1024
# Download-poll timeout: seconds allowed per MB of input, plus a fixed base.
TIMEOUT_FACTOR_MB: Final = 60
BASE_TIMEOUT_SECONDS: Final = 120


# Default minimum file age (minutes) before cleanup_tmp deletes a temp file.
MINUTES_KEEP: Final = 60

# UI noise choices mapped to noise sample files ("None" disables mixing).
NOISES: Final = {
    "None": "None",
    "Noise_0": "assets/samples/extra_noise/noise0.wav",
    "Noise_1": "assets/samples/extra_noise/noise1.wav",
    "Noise_2": "assets/samples/extra_noise/noise2.wav",
}
NOISE_TYPES: Final = list(NOISES.keys())
# SNR dropdown values (dB, as strings; "None" shown when noise is disabled).
SNR_LEVELS: Final = ["None", "-5", "0", "10", "20"]
EXAMPLES: Final = ["Background", "Reverb", "Distortion", "Wind", "Music"]
# Each row: [input sample path, enhancement level (%), model] for gr.Examples.
EXAMPLES_DIR: Final = [
    [
        "assets/samples/input/Background.wav",
        100,
        "FINCH",
    ],
    [
        "assets/samples/input/Reverb.wav",
        100,
        "QUAIL",
    ],
    [
        "assets/samples/input/Distortion.wav",
        100,
        "LARK_V2",
    ],
    [
        "assets/samples/input/Wind.wav",
        100,
        "LARK_V2",
    ],
    [
        "assets/samples/input/Music.wav",
        100,
        "LARK_V2",
    ],
]
intro.md ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ Welcome! This interactive demo allows you to denoise and enhance audio files using AI-Coustics models.
2
+ Learn more about our technology and its capabilities at [AI-Coustics](https://ai-coustics.com/).
3
+
4
+ **How to Use:**
5
+
6
+ Upload or record a (noisy) speech sample to enhance its quality. You can optionally add background noise to the input. To generate new enhanced audio, you'll need an API key. Alternatively, you can preview some enhancement results instantly by selecting one of the preprocessed examples below — no API key required.
7
+
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Requires Python 3.11
2
+ matplotlib>=3.8,<3.10
3
+ soundfile>=0.12.1
4
+ aiohttp>=3.9,<4
5
+ librosa>=0.10.1,<0.11
6
+ loguru~=0.7
7
+ aic-sdk
8
+ python-dotenv
9
+ resampy