Spaces:

mason369
/

AI-RVC

Running

File size: 3,437 Bytes

b6f9c90

# -*- coding: utf-8 -*-
"""

音频处理模块 - 加载、保存和处理音频文件

"""
import numpy as np
import librosa
import soundfile as sf
from typing import Tuple, Optional


def load_audio(path: str, sr: int = 16000) -> np.ndarray:
    """
    Read an audio file as mono and bring it to the requested sample rate.

    Args:
        path: Path of the audio file to read.
        sr: Target sample rate in Hz (default 16000).

    Returns:
        np.ndarray: The decoded samples as a float32 array.
    """
    # Decode at the file's own rate first; resampling is a separate,
    # explicit step that is skipped when the rates already match.
    waveform, native_sr = librosa.load(path, sr=None, mono=True)

    if native_sr == sr:
        return waveform.astype(np.float32)

    resampled = librosa.resample(waveform, orig_sr=native_sr, target_sr=sr)
    return resampled.astype(np.float32)


def save_audio(path: str, audio: np.ndarray, sr: int = 48000):
    """
    Write audio samples to a file.

    Args:
        path: Destination file path.
        audio: Samples to write; values outside [-1, 1] are hard-clipped
            before writing.
        sr: Sample rate in Hz stored with the file (default 48000).
    """
    # Keep every sample inside the [-1, 1] range before handing it to the writer.
    bounded = np.clip(audio, -1.0, 1.0)
    sf.write(file=path, data=bounded, samplerate=sr)


def soft_clip(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """
    Tame peaks with a smooth tanh soft clipper while leaving quieter samples alone.

    Samples whose magnitude is at or below ``threshold`` pass through
    unchanged. Louder samples are squeezed into the band between
    ``threshold`` and ``ceiling`` with a tanh curve, keeping their sign.

    Args:
        audio: Input samples (any shape); converted to float32.
        threshold: Magnitude above which compression starts. Must be > 0.
        ceiling: Upper bound the compressed magnitude approaches. Must be
            greater than ``threshold``.

    Returns:
        np.ndarray: A new float32 array; the input is never modified.

    Raises:
        ValueError: If ``threshold`` is not positive or ``ceiling`` is not
            greater than ``threshold``.
    """
    samples = np.asarray(audio, dtype=np.float32)

    if threshold <= 0:
        raise ValueError("threshold 必须大于 0")
    if ceiling <= threshold:
        raise ValueError("ceiling 必须大于 threshold")

    output = samples.copy()
    magnitude = np.abs(output)
    loud = magnitude > threshold

    if loud.any():
        # Width of the band that the over-threshold samples are mapped into.
        headroom = ceiling - threshold
        # Normalised distance above the threshold; the epsilon guards the division.
        excess = (magnitude[loud] - threshold) / (headroom + 1e-8)
        limited = threshold + headroom * np.tanh(excess)
        # `limited` is always positive, so restoring the sign is a plain copysign.
        output[loud] = np.copysign(limited, output[loud])

    return output


def soft_clip_array(
    audio: np.ndarray,
    threshold: float = 0.9,
    ceiling: float = 0.99,
) -> np.ndarray:
    """
    Array-oriented alias of ``soft_clip`` for mono or multi-channel input.

    Args:
        audio: Input samples.
        threshold: Magnitude above which compression starts.
        ceiling: Upper bound of the soft clipper.

    Returns:
        np.ndarray: Whatever ``soft_clip`` returns for the same arguments.
    """
    clipped = soft_clip(audio, threshold, ceiling)
    return clipped


def get_audio_info(path: str) -> dict:
    """
    Collect basic metadata about an audio file.

    Args:
        path: Path of the audio file to inspect.

    Returns:
        dict: Mapping with the keys ``duration``, ``sample_rate``,
        ``channels`` and ``format``, taken from ``soundfile.info``.
    """
    meta = sf.info(path)

    summary = {}
    summary["duration"] = meta.duration
    summary["sample_rate"] = meta.samplerate
    summary["channels"] = meta.channels
    summary["format"] = meta.format
    return summary


def normalize_audio(audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """
    Scale audio so its RMS level matches a target, then hard-clip to [-1, 1].

    Args:
        audio: Input samples (array or array-like, any shape).
        target_db: Desired RMS level in dB relative to full scale
            (default -20.0).

    Returns:
        np.ndarray: The scaled samples, clipped to [-1, 1]. Input that is
        empty, completely silent, or whose RMS is not finite is returned
        without any gain applied (only clipped).
    """
    samples = np.asarray(audio)

    # An empty signal has no level to measure; skipping it avoids the
    # "mean of empty slice" warning and the NaN that would come with it.
    if samples.size:
        # Accumulate the power in float64 so integer input cannot overflow
        # when squared and float32 input does not lose precision.
        power = np.mean(np.square(samples, dtype=np.float64))
        rms = float(np.sqrt(power))

        # Silent (rms == 0) or non-finite (NaN/inf) levels have no usable gain.
        if rms > 0 and np.isfinite(rms):
            target_rms = 10 ** (target_db / 20)
            # The gain is a plain Python float, so float32 input stays float32.
            samples = samples * (target_rms / rms)

    return np.clip(samples, -1.0, 1.0)


def trim_silence(audio: np.ndarray, sr: int = 16000,
                 top_db: int = 30) -> np.ndarray:
    """
    Strip leading and trailing silence from an audio signal.

    Args:
        audio: Input samples.
        sr: Sample rate in Hz. Accepted for interface compatibility; it is
            not passed on to the trimming call.
        top_db: Silence threshold in dB, forwarded to
            ``librosa.effects.trim``.

    Returns:
        np.ndarray: The trimmed samples.
    """
    # librosa returns (trimmed_signal, interval); only the signal is needed.
    return librosa.effects.trim(audio, top_db=top_db)[0]