DragonLineageAI
/

Vi-SparkTTS-0.5B

@@ -1,151 +0,0 @@
-# Copyright (c) 2025 SparkAudio & The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Utility functions for SparkTTS """
-import random
-import soxr
-import soundfile
-import torch
-import torchaudio
-import numpy as np
-from pathlib import Path
-from typing import Tuple, Dict, Any
-from numpy.lib.stride_tricks import sliding_window_view
-from omegaconf import OmegaConf # Keep if BiCodec config loading needs it
# --- Token Maps (from sparktts/utils/token_parser.py) ---
# Task name -> special token string identifying that task.
TASK_TOKEN_MAP = dict(
    vc="<|task_vc|>",
    tts="<|task_tts|>",
    asr="<|task_asr|>",
    s2s="<|task_s2s|>",
    t2s="<|task_t2s|>",
    understand="<|task_understand|>",
    caption="<|task_cap|>",
    controllable_tts="<|task_controllable_tts|>",
    prompt_tts="<|task_prompt_tts|>",
    speech_edit="<|task_edit|>",
)
# Level name -> 0-based level index, ordered from lowest to highest.
LEVELS_MAP = {
    level_name: level_index
    for level_index, level_name in enumerate(
        ("very_low", "low", "moderate", "high", "very_high")
    )
}
# 1-based level number -> level name (same names and order as LEVELS_MAP,
# shifted by one).
LEVELS_MAP_UI = {
    level_index + 1: level_name for level_name, level_index in LEVELS_MAP.items()
}
# Gender name -> integer id.
GENDER_MAP = dict(female=0, male=1)
-# --- Audio Utils (from sparktts/utils/audio.py) ---
def audio_volume_normalize(audio: np.ndarray, coeff: float = 0.2) -> np.ndarray:
    """Rescale ``audio`` so the level of its loud samples approaches ``coeff``.

    The function works in three stages:

    1. If the peak magnitude is below 0.1, the signal is scaled so the peak
       becomes 0.1 (the divisor is floored at 1e-3 so near-silent input is
       not blown up without bound).
    2. Magnitudes above 0.01 are collected; if more than 10 exist, the mean
       of the band between their 90% and 99% positions (in sorted order) is
       taken as the "volume" and the signal is multiplied by
       ``coeff / volume``, limited to the range [0.1, 10].
    3. If the result peaks above 1, it is divided by its peak.

    Args:
        audio: Sample array. An empty array is returned unchanged.
        coeff: Target level for the loud-sample band.

    Returns:
        The rescaled samples (the input object itself when it is empty).
    """
    magnitudes = np.sort(np.abs(audio))
    if len(magnitudes) == 0:  # Handle empty audio case
        return audio

    peak = magnitudes[-1]
    if peak < 0.1:
        scaling_factor = max(peak, 1e-3)
        audio = audio / scaling_factor * 0.1

    # NOTE: the statistics below use the magnitudes of the signal as it was
    # passed in; they are not recomputed after the quiet-signal boost above.
    loud = magnitudes[magnitudes > 0.01]
    count = loud.shape[0]
    if count <= 10:
        # Too few significant samples for a meaningful level estimate.
        return audio

    # Every element of ``loud`` exceeds 0.01 and the band is non-empty for
    # count > 10, so ``volume`` is strictly positive (no zero check needed).
    volume = np.mean(loud[int(0.9 * count) : int(0.99 * count)])
    audio = audio * np.clip(coeff / volume, a_min=0.1, a_max=10)

    # Keep the output inside [-1, 1].
    max_value = np.max(np.abs(audio))
    if max_value > 1:
        audio = audio / max_value
    return audio
def load_audio(
    adfile: Path,
    sampling_rate: int = None,
    length: int = None,
    volume_normalize: bool = False,
    segment_duration: int = None,
) -> np.ndarray:
    """Read an audio file as float32 samples, with optional post-processing.

    Processing order: read -> keep first channel -> resample -> random
    segment -> volume normalisation -> truncate/pad to ``length``.

    Args:
        adfile: Path of the audio file, handed to ``soundfile.read``.
        sampling_rate: Target rate in Hz. When given and different from the
            file's own rate, the signal is resampled with soxr ("VHQ").
        length: Exact number of output samples. Longer audio is truncated;
            shorter (or equal-length) audio is zero-padded at the end.
        volume_normalize: When true, apply ``audio_volume_normalize``.
        segment_duration: Segment length in seconds; a segment of
            ``int(sr * segment_duration)`` samples is taken with
            ``random_select_audio_segment``.

    Returns:
        The processed samples.

    Raises:
        IOError: The file could not be read (original error is chained).
        ValueError: The file decoded to zero samples.
        RuntimeError: Resampling failed (original error is chained).
    """
    try:
        audio, sr = soundfile.read(adfile, dtype='float32')  # Ensure float32
    except Exception as e:
        # Chain explicitly so the underlying decoder error stays visible.
        raise IOError(f"Could not read audio file {adfile}: {e}") from e

    if audio is None or len(audio) == 0:
        raise ValueError(f"Audio file {adfile} is empty or invalid.")

    # Multi-dimensional data: keep only the first column (first channel).
    if len(audio.shape) > 1:
        audio = audio[:, 0]

    if sampling_rate is not None and sr != sampling_rate:
        try:
            # Ensure input is float64 for soxr
            audio = audio.astype(np.float64)
            audio = soxr.resample(audio, sr, sampling_rate, quality="VHQ")
            # Convert back to float32
            audio = audio.astype(np.float32)
            sr = sampling_rate
        except Exception as e:
            raise RuntimeError(
                f"Failed to resample audio from {sr}Hz to {sampling_rate}Hz: {e}"
            ) from e

    if segment_duration is not None:
        seg_length = int(sr * segment_duration)
        audio = random_select_audio_segment(audio, seg_length)

    if volume_normalize:
        audio = audio_volume_normalize(audio)

    if length is not None:
        if audio.shape[0] > length:
            audio = audio[:length]
        else:
            audio = np.pad(audio, (0, int(length - audio.shape[0])), mode='constant')
    return audio
def random_select_audio_segment(audio: np.ndarray, length: int) -> np.ndarray:
    """Return a ``length``-sample slice of ``audio``.

    Audio shorter than ``length`` is zero-padded at the end and returned from
    its start; audio of exactly ``length`` samples is returned from its
    start; longer audio yields a window whose start offset is drawn
    uniformly with ``random.randint``.
    """
    total = audio.shape[0]
    too_short = total < length
    if too_short:
        audio = np.pad(audio, (0, int(length - total)), mode='constant')

    # Only audio strictly longer than the request leaves room to pick an offset.
    if too_short or total == length:
        offset = 0
    else:
        offset = random.randint(0, total - length)
    return audio[offset:int(offset + length)]
-# --- File Utils (Minimal required) ---
def load_config_yaml(config_path: Path) -> Dict:
    """Loads a YAML configuration file using OmegaConf.

    Args:
        config_path: Location of the YAML file.

    Returns:
        The configuration converted to plain Python containers via
        ``OmegaConf.to_container(..., resolve=True)``.

    Raises:
        FileNotFoundError: ``config_path`` is not an existing file.
        IOError: Loading or converting the file failed; the underlying
            exception is chained as the cause.
    """
    # Check if path exists
    if not Path(config_path).is_file():
        raise FileNotFoundError(f"YAML Config file not found: {config_path}")
    try:
        config = OmegaConf.load(config_path)
        # Convert OmegaConf DictConfig to standard Python dict
        return OmegaConf.to_container(config, resolve=True)
    except Exception as e:
        # Chain explicitly so the parser/interpolation error stays visible.
        raise IOError(f"Error loading YAML config file {config_path}: {e}") from e