Xin Zhang
committed on
Commit
·
11f0e65
1
Parent(s):
23ac623
[fix]: update.
Browse files
- config.py +1 -0
- moyoyo_asr_models/hotwords.json +5 -1
- moyoyo_asr_models/hotwords.txt +0 -1
- transcribe/helpers/funasr.py +1 -1
- transcribe/utils.py +47 -10
config.py
CHANGED
|
@@ -73,3 +73,4 @@ LLM_SYS_PROMPT_EN = """
|
|
| 73 |
"""
|
| 74 |
|
| 75 |
hotwords_file = MODEL_DIR / 'hotwords.txt'
|
|
|
|
|
|
| 73 |
"""
|
| 74 |
|
| 75 |
hotwords_file = MODEL_DIR / 'hotwords.txt'
|
| 76 |
+
hotwords_json = MODEL_DIR / 'hotwords.json'
|
moyoyo_asr_models/hotwords.json
CHANGED
|
@@ -1,3 +1,7 @@
|
|
| 1 |
{
|
| 2 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"高斯姆": "GOSIM",
|
| 3 |
+
"GO SIM": "GOSIM",
|
| 4 |
+
"go sim": "GOSIM",
|
| 5 |
+
"GO SAME": "GOSIM",
|
| 6 |
+
"go same": "GOSIM"
|
| 7 |
}
|
moyoyo_asr_models/hotwords.txt
CHANGED
|
@@ -10,7 +10,6 @@ lib
|
|
| 10 |
HUAWEI
|
| 11 |
Future为
|
| 12 |
Cloud
|
| 13 |
-
NVIDIA
|
| 14 |
OpenAI
|
| 15 |
PYTHON
|
| 16 |
八币
|
|
|
|
| 10 |
HUAWEI
|
| 11 |
Future为
|
| 12 |
Cloud
|
|
|
|
| 13 |
OpenAI
|
| 14 |
PYTHON
|
| 15 |
八币
|
transcribe/helpers/funasr.py
CHANGED
|
@@ -21,7 +21,7 @@ class FunASR:
|
|
| 21 |
def warmup(self, warmup_steps=1):
|
| 22 |
warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
|
| 23 |
for _ in range(warmup_steps):
|
| 24 |
-
self.model.generate(input=warmup_soundfile, disable_pbar=True
|
| 25 |
|
| 26 |
def transcribe(self, audio_buffer: bytes, language):
|
| 27 |
audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
|
|
|
|
| 21 |
def warmup(self, warmup_steps=1):
|
| 22 |
warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
|
| 23 |
for _ in range(warmup_steps):
|
| 24 |
+
self.model.generate(input=warmup_soundfile, disable_pbar=True)
|
| 25 |
|
| 26 |
def transcribe(self, audio_buffer: bytes, language):
|
| 27 |
audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
|
transcribe/utils.py
CHANGED
|
@@ -8,6 +8,7 @@ import config
|
|
| 8 |
import csv
|
| 9 |
import av
|
| 10 |
import re
|
|
|
|
| 11 |
|
| 12 |
# Compile regex patterns once outside the loop for better performance
|
| 13 |
p_pattern = re.compile(r"(\s*\[.*?\])")
|
|
@@ -18,43 +19,79 @@ p_end_pattern = re.compile(r"(\s*.*\])")
|
|
| 18 |
def filter_words(res_word):
|
| 19 |
"""
|
| 20 |
Filter words according to specific bracket patterns.
|
| 21 |
-
|
| 22 |
Args:
|
| 23 |
res_word: Iterable of word objects with a 'text' attribute
|
| 24 |
-
|
| 25 |
Returns:
|
| 26 |
List of filtered word objects
|
| 27 |
"""
|
| 28 |
asr_results = []
|
| 29 |
skip_word = False
|
| 30 |
-
|
| 31 |
for word in res_word:
|
| 32 |
# Skip words that completely match the pattern
|
| 33 |
if p_pattern.match(word.text):
|
| 34 |
continue
|
| 35 |
-
|
| 36 |
# Mark the start of a section to skip
|
| 37 |
if p_start_pattern.match(word.text):
|
| 38 |
skip_word = True
|
| 39 |
continue
|
| 40 |
-
|
| 41 |
# Mark the end of a section to skip
|
| 42 |
if p_end_pattern.match(word.text) and skip_word:
|
| 43 |
skip_word = False
|
| 44 |
continue
|
| 45 |
-
|
| 46 |
# Skip words if we're in a skip section
|
| 47 |
if skip_word:
|
| 48 |
continue
|
| 49 |
-
|
|
|
|
|
|
|
| 50 |
# Add the word to results if it passed all filters
|
| 51 |
asr_results.append(word)
|
| 52 |
-
|
| 53 |
return asr_results
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
def log_block(key: str, value, unit=''):
|
| 56 |
if config.DEBUG:
|
| 57 |
-
return
|
| 58 |
"""格式化输出日志内容"""
|
| 59 |
key_fmt = f"[ {key.ljust(25)}]" # 左对齐填充
|
| 60 |
val_fmt = f"{value} {unit}".strip()
|
|
@@ -171,4 +208,4 @@ class TestDataWriter:
|
|
| 171 |
def write(self, result: 'DebugResult'):
|
| 172 |
with open(self.file_path, mode='a', newline='') as file:
|
| 173 |
writer = csv.DictWriter(file, fieldnames=self.fieldnames)
|
| 174 |
-
writer.writerow(result.model_dump(by_alias=True))
|
|
|
|
| 8 |
import csv
|
| 9 |
import av
|
| 10 |
import re
|
| 11 |
+
import json
|
| 12 |
|
| 13 |
# Compile regex patterns once outside the loop for better performance
|
| 14 |
p_pattern = re.compile(r"(\s*\[.*?\])")
|
|
|
|
| 19 |
def filter_words(res_word):
|
| 20 |
"""
|
| 21 |
Filter words according to specific bracket patterns.
|
| 22 |
+
|
| 23 |
Args:
|
| 24 |
res_word: Iterable of word objects with a 'text' attribute
|
| 25 |
+
|
| 26 |
Returns:
|
| 27 |
List of filtered word objects
|
| 28 |
"""
|
| 29 |
asr_results = []
|
| 30 |
skip_word = False
|
| 31 |
+
|
| 32 |
for word in res_word:
|
| 33 |
# Skip words that completely match the pattern
|
| 34 |
if p_pattern.match(word.text):
|
| 35 |
continue
|
| 36 |
+
|
| 37 |
# Mark the start of a section to skip
|
| 38 |
if p_start_pattern.match(word.text):
|
| 39 |
skip_word = True
|
| 40 |
continue
|
| 41 |
+
|
| 42 |
# Mark the end of a section to skip
|
| 43 |
if p_end_pattern.match(word.text) and skip_word:
|
| 44 |
skip_word = False
|
| 45 |
continue
|
| 46 |
+
|
| 47 |
# Skip words if we're in a skip section
|
| 48 |
if skip_word:
|
| 49 |
continue
|
| 50 |
+
|
| 51 |
+
word.text = replace_hotwords(word.text)
|
| 52 |
+
|
| 53 |
# Add the word to results if it passed all filters
|
| 54 |
asr_results.append(word)
|
| 55 |
+
|
| 56 |
return asr_results
|
| 57 |
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
def replace_hotwords(text: str) -> str:
|
| 61 |
+
"""
|
| 62 |
+
Reads hotwords from a JSON file and replaces occurrences in the input text.
|
| 63 |
+
|
| 64 |
+
Args:
|
| 65 |
+
text: The input string to process.
|
| 66 |
+
|
| 67 |
+
Returns:
|
| 68 |
+
The string with hotwords replaced.
|
| 69 |
+
"""
|
| 70 |
+
# Using the provided absolute path for simplicity in this example
|
| 71 |
+
hotwords_path = config.hotwords_file.as_posix()
|
| 72 |
+
|
| 73 |
+
try:
|
| 74 |
+
with open(hotwords_path, 'r', encoding='utf-8') as f:
|
| 75 |
+
hotwords = json.load(f)
|
| 76 |
+
except FileNotFoundError:
|
| 77 |
+
print(f"Error: hotwords.json not found at {hotwords_path}")
|
| 78 |
+
return text
|
| 79 |
+
except json.JSONDecodeError:
|
| 80 |
+
print(f"Error: Could not decode JSON from {hotwords_path}")
|
| 81 |
+
return text
|
| 82 |
+
|
| 83 |
+
processed_text = text
|
| 84 |
+
# Iterate through the hotwords dictionary
|
| 85 |
+
for key, value in hotwords.items():
|
| 86 |
+
# Replace all occurrences of the key with the value in the text
|
| 87 |
+
processed_text = processed_text.replace(key, value)
|
| 88 |
+
|
| 89 |
+
return processed_text
|
| 90 |
+
|
| 91 |
+
|
| 92 |
def log_block(key: str, value, unit=''):
|
| 93 |
if config.DEBUG:
|
| 94 |
+
return
|
| 95 |
"""格式化输出日志内容"""
|
| 96 |
key_fmt = f"[ {key.ljust(25)}]" # 左对齐填充
|
| 97 |
val_fmt = f"{value} {unit}".strip()
|
|
|
|
| 208 |
def write(self, result: 'DebugResult'):
|
| 209 |
with open(self.file_path, mode='a', newline='') as file:
|
| 210 |
writer = csv.DictWriter(file, fieldnames=self.fieldnames)
|
| 211 |
+
writer.writerow(result.model_dump(by_alias=True))
|