h0witended
/

tiny-random-MiniCPM-o-2_6

@@ -1,203 +0,0 @@
-# coding=utf-8
-# Copyright 2025 The OpenBMB Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import re
-import librosa
-import numpy as np
-logger = logging.getLogger(__name__)  # module-level logger; VoiceChecker.is_bad emits its per-chunk diagnostics through it
def is_silent(data, threshold=3e-3):
    """Return True when no sample of ``data`` reaches ``threshold`` in magnitude.

    Args:
        data: Audio samples as a NumPy array or any array-like of numbers.
        threshold: Peak absolute amplitude below which the input counts as
            silent. Defaults to ``3e-3``, the value this check always used.

    Returns:
        bool: ``True`` if the peak absolute value is strictly below
        ``threshold``, or if ``data`` holds no samples at all; ``False``
        otherwise.
    """
    samples = np.asarray(data)
    if samples.size == 0:
        # max() of an empty array raises ValueError; no samples means no sound.
        return True
    return bool(np.abs(samples).max() < threshold)
def sentence_end(txt):
    """Return the sentence-ending punctuation mark found in ``txt``, if any.

    Marks are tried in a fixed priority order (".", "。", "!", "?", "！", "？")
    and the first mark of that order that is present is returned; the
    position of the mark inside ``txt`` does not influence the choice.

    An ASCII period directly preceded by a digit (as in "1." or "3.14") is
    not treated as a sentence end. Every period in ``txt`` is inspected, so a
    genuine period later in the text is still detected even when an earlier
    one belongs to a number.

    Args:
        txt (str): Text to inspect.

    Returns:
        str: The matching punctuation character, or ``""`` when none is found.
    """
    for mark in (".", "。", "!", "?", "！", "？"):
        if mark not in txt:
            continue
        if mark == ".":
            # Skip "." only when *every* period follows a digit; a period at
            # index 0 has nothing before it and therefore counts as an end.
            only_numeric_periods = all(
                pos > 0 and txt[pos - 1].isdigit()
                for pos, char in enumerate(txt)
                if char == "."
            )
            if only_numeric_periods:
                continue
        return mark
    return ""
class NumberToTextConverter:
    r"""
    A helper class to ensure text-to-speech (TTS) systems read numeric digits
    in the desired language (Chinese or English) digit-by-digit. It replaces
    every run of decimal digits in a text with the language-specific word for
    each digit, thereby reducing the likelihood of TTS mistakes on numbers.

    Note: MiniCPM-o 2.6 only uses this in streaming mode.

    Attributes:
        num_to_chinese (dict):
            Mapping from ASCII digit (str) to its Chinese textual form (str).
        num_to_english (dict):
            Mapping from ASCII digit (str) to its English textual form (str).

    Example:
        >>> converter = NumberToTextConverter()
        >>> converter.replace_numbers_with_text("我有2个苹果", language="chinese")
        '我有二个苹果'
        >>> converter.replace_numbers_with_text("I have 23 books", language="english")
        'I have two three books'
    """

    def __init__(self):
        self.num_to_chinese = {
            "0": "零",
            "1": "一",
            "2": "二",
            "3": "三",
            "4": "四",
            "5": "五",
            "6": "六",
            "7": "七",
            "8": "八",
            "9": "九",
        }
        self.num_to_english = {
            "0": "zero",
            "1": "one",
            "2": "two",
            "3": "three",
            "4": "four",
            "5": "five",
            "6": "six",
            "7": "seven",
            "8": "eight",
            "9": "nine",
        }

    @staticmethod
    def _normalize_digit(char):
        """Map any Unicode decimal digit (e.g. full-width "１") to its ASCII form.

        Characters that are not decimal digits are returned unchanged.
        """
        return str(int(char)) if char.isdecimal() else char

    def _spell_digits(self, num_str, mapping):
        """Return the words from ``mapping`` for each digit of ``num_str``.

        Characters without an entry in ``mapping`` (separators, letters, ...)
        are skipped.
        """
        words = (mapping.get(self._normalize_digit(char)) for char in num_str)
        return [word for word in words if word is not None]

    def number_to_chinese_digit_by_digit(self, num_str):
        """Spell ``num_str`` digit by digit in Chinese, with no separator.

        Args:
            num_str (str): String whose digits should be spelled out.

        Returns:
            str: Concatenated Chinese digit words; non-digit characters are
            dropped.
        """
        return "".join(self._spell_digits(num_str, self.num_to_chinese))

    def number_to_english_digit_by_digit(self, num_str):
        """Spell ``num_str`` digit by digit in English, separated by spaces.

        Args:
            num_str (str): String whose digits should be spelled out.

        Returns:
            str: Space-joined English digit words; non-digit characters are
            dropped.
        """
        return " ".join(self._spell_digits(num_str, self.num_to_english))

    def detect_language(self, text):
        """Guess whether ``text`` is mainly Chinese or English.

        Counts characters in the CJK range U+4E00-U+9FFF against ASCII
        letters. Ties, including text that contains neither, resolve to
        ``"chinese"``.

        Args:
            text (str): Text to classify.

        Returns:
            str: ``"chinese"`` or ``"english"``.
        """
        chinese_count = len(re.findall(r"[\u4e00-\u9fff]", text))
        english_count = len(re.findall(r"[a-zA-Z]", text))
        return "chinese" if chinese_count >= english_count else "english"

    def replace_numbers_with_text(self, text, language=None):
        """Replace every run of digits in ``text`` with spelled-out digits.

        Args:
            text (str): Input text.
            language (str, optional): ``"chinese"`` selects Chinese digit
                words; any other value selects English. When ``None`` the
                language is chosen by :meth:`detect_language`.

        Returns:
            str: ``text`` with each digit run replaced in place.
        """
        if language is None:
            language = self.detect_language(text)
        if language == "chinese":
            spell = self.number_to_chinese_digit_by_digit
        else:
            spell = self.number_to_english_digit_by_digit
        # One left-to-right substitution pass. ``\d`` also matches non-ASCII
        # decimal digits; those are normalized by the spell helpers so they
        # are converted rather than silently removed from the text.
        return re.sub(r"\d+", lambda match: spell(match.group()), text)
class VoiceChecker:
    r"""
    A simple utility class to detect silence or low variation in consecutive audio chunks by comparing
    the mel-spectrogram distances. It keeps track of consecutive zero-distance and low-distance chunks
    to decide if the audio is considered "bad" (e.g., overly silent or not changing enough).

    The counters persist across calls to :meth:`is_bad` until :meth:`reset` is called.

    Attributes:
        previous_mel (`np.ndarray` or `None`):
            Holds the previously observed mel-spectrogram in decibel scale. Used to compute
            the next distance; reset via :meth:`reset`.
        consecutive_zeros (`int`):
            The number of consecutive chunks that were detected as silent (distance = 0).
        consecutive_low_distance (`int`):
            The number of consecutive chunks whose distance was below the threshold.

    Example:
        >>> checker = VoiceChecker()
        >>> # Suppose we have audio_wav (list or np.ndarray) and mel_spec (np.ndarray)
        >>> # We split them into chunks and call checker.is_bad(...)
        >>> is_audio_bad = checker.is_bad(audio_wav, mel_spec, chunk_size=2560, thresh=100.0)
        >>> if is_audio_bad:
        ...     print("Audio deemed bad!")
        >>> # Reset states if needed
        >>> checker.reset()
    """

    def __init__(self):
        self.previous_mel = None
        self.consecutive_zeros = 0
        self.consecutive_low_distance = 0

    def compute_distance(self, audio_chunk, mel_spec):
        """Return the distance between this chunk's mel-spectrogram and the previous one.

        Args:
            audio_chunk: Audio samples of the chunk, used only for the silence check.
            mel_spec (`np.ndarray`): Mel-spectrogram slice of the chunk; it is converted
                with ``librosa.power_to_db`` and averaged over axis 1.

        Returns:
            float: ``0.0`` when the chunk is silent (``previous_mel`` is left untouched),
            ``-1.0`` when there is no previous spectrogram to compare against, otherwise
            the Euclidean norm of the difference between the axis-1 means of the current
            and previous decibel spectrograms.
        """
        if is_silent(audio_chunk):
            return 0.0  # silent (blank) chunk: report zero distance
        mel_db = librosa.power_to_db(mel_spec)
        if self.previous_mel is None:
            self.previous_mel = mel_db
            # Sentinel for "nothing to compare with yet".
            return -1.0
        distance = np.linalg.norm(np.mean(mel_db, axis=1) - np.mean(self.previous_mel, axis=1))
        self.previous_mel = mel_db
        return distance

    def is_bad(self, audio_wav, mel_spec, chunk_size=2560, thresh=100.0):
        """Scan the audio chunk by chunk and report whether it should be rejected.

        Args:
            audio_wav: Sequence of audio samples; only whole chunks of ``chunk_size``
                samples are examined, any trailing remainder is ignored.
            mel_spec (`np.ndarray`): Mel-spectrogram of ``audio_wav``; its last axis is
                split evenly across the audio chunks.
            chunk_size (`int`): Number of audio samples per chunk.
            thresh (`float`): Distances below this value count as "low distance".

        Returns:
            bool: ``True`` as soon as 12 consecutive zero-distance chunks or 5 consecutive
            low-distance chunks have been seen (counting chunks from earlier calls since
            the last :meth:`reset`); ``False`` otherwise, including when ``audio_wav`` is
            shorter than one chunk.
        """
        num_chunks = len(audio_wav) // chunk_size
        if num_chunks == 0:
            # Less than one full chunk: nothing to evaluate, and the division
            # below would otherwise raise ZeroDivisionError.
            return False
        mel_chunk_size = mel_spec.shape[-1] // num_chunks
        for i in range(num_chunks):
            audio_chunk = audio_wav[i * chunk_size : (i + 1) * chunk_size]
            mel_spec_chunk = mel_spec[:, i * mel_chunk_size : (i + 1) * mel_chunk_size]
            distance = self.compute_distance(audio_chunk, mel_spec_chunk)
            logger.warning(
                f"mel dist: {distance:.1f}, zero: {self.consecutive_zeros}, low: {self.consecutive_low_distance}"
            )
            if distance == 0:
                self.consecutive_low_distance = 0  # reset
                self.consecutive_zeros += 1
                if self.consecutive_zeros >= 12:
                    logger.warning("VoiceChecker detected 1.2 s silent. Marking as failed.")
                    return True
            elif distance < thresh:
                # NOTE: the -1.0 "no previous chunk" sentinel from compute_distance
                # also lands here whenever thresh > -1, so the first non-silent
                # chunk is counted as a low-distance chunk.
                self.consecutive_zeros = 0
                self.consecutive_low_distance += 1
                if self.consecutive_low_distance >= 5:
                    logger.warning("VoiceChecker detected 5 consecutive low distance chunks. Marking as failed.")
                    return True
            else:
                self.consecutive_low_distance = 0
                self.consecutive_zeros = 0
        return False

    def reset(self):
        """Forget the previous spectrogram and clear both consecutive-chunk counters."""
        self.previous_mel = None
        self.consecutive_zeros = 0
        self.consecutive_low_distance = 0