h0witended committed on
Commit
2afcced
·
verified ·
1 Parent(s): 6f9ddf4

Delete utils.py

Browse files
Files changed (1) hide show
  1. utils.py +0 -203
utils.py DELETED
@@ -1,203 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2025 The OpenBMB Team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- import logging
17
- import re
18
-
19
- import librosa
20
- import numpy as np
21
-
22
- logger = logging.getLogger(__name__)
23
-
24
-
25
def is_silent(data, threshold=3e-3):
    """Return ``True`` when no sample in ``data`` reaches ``threshold`` in magnitude.

    Args:
        data: Audio samples; anything ``np.asarray`` accepts (list or ndarray).
        threshold: Peak absolute amplitude below which the audio counts as
            silent. Defaults to ``3e-3``, the value that used to be hard-coded.

    Returns:
        bool: ``True`` if the peak absolute amplitude is strictly below
        ``threshold`` or if ``data`` holds no samples, ``False`` otherwise.
    """
    samples = np.asarray(data)
    # An empty buffer has no peak and ``max()`` would raise ValueError on it,
    # so "no samples at all" is reported as silence instead of crashing.
    if samples.size == 0:
        return True
    # ``bool(...)`` keeps the return a plain Python bool rather than np.bool_.
    return bool(np.abs(samples).max() < threshold)
30
-
31
-
32
def sentence_end(txt):
    """Return the sentence-ending mark present in ``txt``, or ``""`` if none.

    Marks are tried in a fixed priority order (ASCII ".", "。", ASCII "!",
    ASCII "?", full-width "!", full-width "?"), not by position in the text;
    the first mark in that order that occurs anywhere in ``txt`` is returned.

    An ASCII "." that directly follows a digit (as in "1." or "3.14") is not
    a sentence end. Every "." in the text is examined, so "1. Hello." still
    reports "." because of the final period.

    Args:
        txt (str): Text to inspect.

    Returns:
        str: The matching mark, or the empty string when there is none.
    """
    # Full-width marks are written as escapes so they cannot be silently
    # normalised to their ASCII look-alikes (which would make them duplicates).
    marks = (".", "。", "!", "?", "\uff01", "\uff1f")
    for mark in marks:
        if mark not in txt:
            continue
        if mark == ".":
            # Accept the period only if at least one "." is not glued to a
            # preceding digit; a "." at index 0 has nothing before it.
            has_real_period = any(
                ch == "." and (pos == 0 or not txt[pos - 1].isdigit())
                for pos, ch in enumerate(txt)
            )
            if not has_real_period:
                continue
        return mark
    return ""
42
-
43
-
44
class NumberToTextConverter:
    r"""
    A helper class to ensure text-to-speech (TTS) systems read numeric digits
    in the desired language (Chinese or English) digit-by-digit. It replaces
    every run of ASCII digits in a text with the language-specific words for
    each digit, thereby reducing the likelihood of TTS mistakes on numbers.
    Note: MiniCPM-o 2.6 only uses this in streaming mode.

    Only the ASCII digits ``0``-``9`` are rewritten. Other Unicode digits
    (for example full-width digits) have no entry in the digit tables and are
    left in the text untouched rather than being dropped.

    Attributes:
        num_to_chinese (dict):
            Mapping from digit (str) to its Chinese textual form (str).
        num_to_english (dict):
            Mapping from digit (str) to its English textual form (str).

    Example:
        >>> converter = NumberToTextConverter()
        >>> converter.replace_numbers_with_text("我有2个苹果", language="chinese")
        '我有二个苹果'
        >>> converter.replace_numbers_with_text("I have 23 books", language="english")
        'I have two three books'
    """

    # Compiled once at class creation. The digit pattern is an explicit ASCII
    # range because ``\d`` on str also matches non-ASCII decimal digits, which
    # the digit tables below cannot translate.
    _DIGIT_RUN = re.compile(r"[0-9]+")
    _CJK_CHAR = re.compile(r"[\u4e00-\u9fff]")
    _LATIN_LETTER = re.compile(r"[a-zA-Z]")

    def __init__(self):
        self.num_to_chinese = {
            "0": "零",
            "1": "一",
            "2": "二",
            "3": "三",
            "4": "四",
            "5": "五",
            "6": "六",
            "7": "七",
            "8": "八",
            "9": "九",
        }
        self.num_to_english = {
            "0": "zero",
            "1": "one",
            "2": "two",
            "3": "three",
            "4": "four",
            "5": "five",
            "6": "six",
            "7": "seven",
            "8": "eight",
            "9": "nine",
        }

    def number_to_chinese_digit_by_digit(self, num_str):
        """Return the Chinese digit words for ``num_str``, concatenated.

        Characters without an entry in ``num_to_chinese`` are skipped.
        """
        table = self.num_to_chinese
        return "".join(table[ch] for ch in num_str if ch in table)

    def number_to_english_digit_by_digit(self, num_str):
        """Return the English digit words for ``num_str``, space-separated.

        Characters without an entry in ``num_to_english`` are skipped.
        """
        table = self.num_to_english
        return " ".join(table[ch] for ch in num_str if ch in table)

    def detect_language(self, text):
        """Guess the language of ``text`` by counting characters.

        Returns:
            str: ``"chinese"`` when the number of characters in the range
            U+4E00-U+9FFF is at least the number of ASCII letters (ties,
            including text with neither, resolve to Chinese), otherwise
            ``"english"``.
        """
        chinese_count = len(self._CJK_CHAR.findall(text))
        english_count = len(self._LATIN_LETTER.findall(text))
        return "chinese" if chinese_count >= english_count else "english"

    def replace_numbers_with_text(self, text, language=None):
        """Replace each run of ASCII digits in ``text`` with spelled-out digits.

        Args:
            text (str): Input text.
            language (str, optional): ``"chinese"`` selects Chinese digit
                words; any other non-``None`` value selects English. When
                ``None``, the language is chosen by :meth:`detect_language`.

        Returns:
            str: ``text`` with every ASCII digit run converted digit-by-digit.
        """
        if language is None:
            language = self.detect_language(text)

        if language == "chinese":
            spell = self.number_to_chinese_digit_by_digit
        else:
            spell = self.number_to_english_digit_by_digit

        # A single substitution pass rewrites each digit run exactly where it
        # was matched, instead of re-searching the text once per number.
        return self._DIGIT_RUN.sub(lambda match: spell(match.group()), text)
125
-
126
-
127
class VoiceChecker:
    r"""
    A simple utility class to detect silence or low variation in consecutive audio chunks by comparing
    the mel-spectrogram distances. It keeps track of consecutive zero-distance and low-distance chunks
    to decide if the audio is considered "bad" (e.g., overly silent or not changing enough).

    The counters and the previous mel-spectrogram are instance state that :meth:`is_bad` never clears
    on entry, so they carry over between calls until :meth:`reset` is invoked.

    Attributes:
        previous_mel (`np.ndarray` or `None`):
            Holds the previously observed mel-spectrogram in decibel scale. Used to compute
            the next distance; reset via :meth:`reset`.
        consecutive_zeros (`int`):
            The number of consecutive chunks that were detected as silent (distance = 0).
        consecutive_low_distance (`int`):
            The number of consecutive chunks whose distance was below the threshold.

    Example:
        >>> checker = VoiceChecker()
        >>> # Suppose we have audio_wav (list or np.ndarray) and mel_spec (np.ndarray)
        >>> # We split them into chunks and call checker.is_bad(...)
        >>> is_audio_bad = checker.is_bad(audio_wav, mel_spec, chunk_size=2560, thresh=100.0)
        >>> if is_audio_bad:
        ...     print("Audio deemed bad!")
        >>> # Reset states if needed
        >>> checker.reset()
    """

    # Number of consecutive silent chunks after which the audio is rejected.
    SILENT_CHUNK_LIMIT = 12
    # Number of consecutive low-distance chunks after which the audio is rejected.
    LOW_DISTANCE_CHUNK_LIMIT = 5

    def __init__(self):
        self.previous_mel = None
        self.consecutive_zeros = 0
        self.consecutive_low_distance = 0

    def compute_distance(self, audio_chunk, mel_spec):
        """Return the distance between this chunk's mel-spectrogram and the previous one.

        Args:
            audio_chunk: Waveform samples of the chunk, passed to ``is_silent``.
            mel_spec (`np.ndarray`): Mel-spectrogram slice for the chunk, converted with
                ``librosa.power_to_db`` and averaged over ``axis=1``.

        Returns:
            float: ``0.0`` when the chunk is silent (``previous_mel`` is left unchanged),
            ``-1.0`` for the first non-silent chunk seen since construction or :meth:`reset`,
            otherwise the Euclidean norm of the difference between the two per-row means.
        """
        if is_silent(audio_chunk):
            return 0.0  # check whether this is a blank (silent) segment

        mel_db = librosa.power_to_db(mel_spec)
        if self.previous_mel is None:
            # Nothing to compare against yet: remember this chunk and return a sentinel.
            self.previous_mel = mel_db
            return -1.0

        distance = np.linalg.norm(np.mean(mel_db, axis=1) - np.mean(self.previous_mel, axis=1))
        self.previous_mel = mel_db
        return distance

    def is_bad(self, audio_wav, mel_spec, chunk_size=2560, thresh=100.0):
        """Scan the audio chunk by chunk and report whether it should be rejected.

        Args:
            audio_wav: Waveform samples; only whole chunks of ``chunk_size`` samples are examined.
            mel_spec (`np.ndarray`): Mel-spectrogram whose last axis is split evenly across the chunks.
            chunk_size (int): Number of waveform samples per chunk.
            thresh (float): Distances below this value count as "low distance".

        Returns:
            bool: ``True`` as soon as ``SILENT_CHUNK_LIMIT`` consecutive silent chunks or
            ``LOW_DISTANCE_CHUNK_LIMIT`` consecutive low-distance chunks have accumulated,
            ``False`` otherwise, including when the audio is shorter than one chunk.
        """
        num_chunks = len(audio_wav) // chunk_size
        if num_chunks <= 0:
            # Less than one full chunk: nothing to judge, and the division below
            # would otherwise fail with ZeroDivisionError.
            return False

        mel_chunk_size = mel_spec.shape[-1] // num_chunks
        for i in range(num_chunks):
            audio_chunk = audio_wav[i * chunk_size : (i + 1) * chunk_size]
            mel_spec_chunk = mel_spec[:, i * mel_chunk_size : (i + 1) * mel_chunk_size]

            distance = self.compute_distance(audio_chunk, mel_spec_chunk)
            logger.warning(
                "mel dist: %.1f, zero: %s, low: %s",
                distance,
                self.consecutive_zeros,
                self.consecutive_low_distance,
            )
            if distance == 0:
                self.consecutive_low_distance = 0  # reset
                self.consecutive_zeros += 1
                if self.consecutive_zeros >= self.SILENT_CHUNK_LIMIT:
                    logger.warning("VoiceChecker detected 1.2 s silent. Marking as failed.")
                    return True
            elif distance < thresh:
                # NOTE(review): the -1.0 "first chunk" sentinel from compute_distance also
                # lands in this branch and is counted as a low-distance chunk. Confirm
                # whether that is intended before changing it.
                self.consecutive_zeros = 0
                self.consecutive_low_distance += 1
                if self.consecutive_low_distance >= self.LOW_DISTANCE_CHUNK_LIMIT:
                    logger.warning("VoiceChecker detected 5 consecutive low distance chunks. Marking as failed.")
                    return True
            else:
                self.consecutive_low_distance = 0
                self.consecutive_zeros = 0

        return False

    def reset(self):
        """Forget the previous mel-spectrogram and zero both consecutive-chunk counters."""
        self.previous_mel = None
        self.consecutive_zeros = 0
        self.consecutive_low_distance = 0