Xin Zhang committed on
Commit
11f0e65
·
1 Parent(s): 23ac623

[fix]: update.

Browse files
config.py CHANGED
@@ -73,3 +73,4 @@ LLM_SYS_PROMPT_EN = """
73
  """
74
 
75
  hotwords_file = MODEL_DIR / 'hotwords.txt'
 
 
73
  """
74
 
75
  hotwords_file = MODEL_DIR / 'hotwords.txt'
76
+ hotwords_json = MODEL_DIR / 'hotwords.json'
moyoyo_asr_models/hotwords.json CHANGED
@@ -1,3 +1,7 @@
1
  {
2
- "勾色母": "GOSIM"
 
 
 
 
3
  }
 
1
  {
2
+ "高斯姆": "GOSIM",
3
+ "GO SIM": "GOSIM",
4
+ "go sim": "GOSIM",
5
+ "GO SAME": "GOSIM",
6
+ "go same": "GOSIM"
7
  }
moyoyo_asr_models/hotwords.txt CHANGED
@@ -10,7 +10,6 @@ lib
10
  HUAWEI
11
  Future为
12
  Cloud
13
- NVIDIA
14
  OpenAI
15
  PYTHON
16
  八币
 
10
  HUAWEI
11
  Future为
12
  Cloud
 
13
  OpenAI
14
  PYTHON
15
  八币
transcribe/helpers/funasr.py CHANGED
@@ -21,7 +21,7 @@ class FunASR:
21
  def warmup(self, warmup_steps=1):
22
  warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
23
  for _ in range(warmup_steps):
24
- self.model.generate(input=warmup_soundfile, disable_pbar=True, hotword=config.hotwords_file.as_posix())
25
 
26
  def transcribe(self, audio_buffer: bytes, language):
27
  audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
 
21
  def warmup(self, warmup_steps=1):
22
  warmup_soundfile = f"{config.ASSERT_DIR}/jfk.flac"
23
  for _ in range(warmup_steps):
24
+ self.model.generate(input=warmup_soundfile, disable_pbar=True)
25
 
26
  def transcribe(self, audio_buffer: bytes, language):
27
  audio_frames = np.frombuffer(audio_buffer, dtype=np.float32)
transcribe/utils.py CHANGED
@@ -8,6 +8,7 @@ import config
8
  import csv
9
  import av
10
  import re
 
11
 
12
  # Compile regex patterns once outside the loop for better performance
13
  p_pattern = re.compile(r"(\s*\[.*?\])")
@@ -18,43 +19,79 @@ p_end_pattern = re.compile(r"(\s*.*\])")
18
  def filter_words(res_word):
19
  """
20
  Filter words according to specific bracket patterns.
21
-
22
  Args:
23
  res_word: Iterable of word objects with a 'text' attribute
24
-
25
  Returns:
26
  List of filtered word objects
27
  """
28
  asr_results = []
29
  skip_word = False
30
-
31
  for word in res_word:
32
  # Skip words that completely match the pattern
33
  if p_pattern.match(word.text):
34
  continue
35
-
36
  # Mark the start of a section to skip
37
  if p_start_pattern.match(word.text):
38
  skip_word = True
39
  continue
40
-
41
  # Mark the end of a section to skip
42
  if p_end_pattern.match(word.text) and skip_word:
43
  skip_word = False
44
  continue
45
-
46
  # Skip words if we're in a skip section
47
  if skip_word:
48
  continue
49
-
 
 
50
  # Add the word to results if it passed all filters
51
  asr_results.append(word)
52
-
53
  return asr_results
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  def log_block(key: str, value, unit=''):
56
  if config.DEBUG:
57
- return
58
  """格式化输出日志内容"""
59
  key_fmt = f"[ {key.ljust(25)}]" # 左对齐填充
60
  val_fmt = f"{value} {unit}".strip()
@@ -171,4 +208,4 @@ class TestDataWriter:
171
  def write(self, result: 'DebugResult'):
172
  with open(self.file_path, mode='a', newline='') as file:
173
  writer = csv.DictWriter(file, fieldnames=self.fieldnames)
174
- writer.writerow(result.model_dump(by_alias=True))
 
8
  import csv
9
  import av
10
  import re
11
+ import json
12
 
13
  # Compile regex patterns once outside the loop for better performance
14
  p_pattern = re.compile(r"(\s*\[.*?\])")
 
19
  def filter_words(res_word):
20
  """
21
  Filter words according to specific bracket patterns.
22
+
23
  Args:
24
  res_word: Iterable of word objects with a 'text' attribute
25
+
26
  Returns:
27
  List of filtered word objects
28
  """
29
  asr_results = []
30
  skip_word = False
31
+
32
  for word in res_word:
33
  # Skip words that completely match the pattern
34
  if p_pattern.match(word.text):
35
  continue
36
+
37
  # Mark the start of a section to skip
38
  if p_start_pattern.match(word.text):
39
  skip_word = True
40
  continue
41
+
42
  # Mark the end of a section to skip
43
  if p_end_pattern.match(word.text) and skip_word:
44
  skip_word = False
45
  continue
46
+
47
  # Skip words if we're in a skip section
48
  if skip_word:
49
  continue
50
+
51
+ word.text = replace_hotwords(word.text)
52
+
53
  # Add the word to results if it passed all filters
54
  asr_results.append(word)
55
+
56
  return asr_results
57
 
58
+
59
+
60
+ def replace_hotwords(text: str) -> str:
61
+ """
62
+ Reads hotwords from a JSON file and replaces occurrences in the input text.
63
+
64
+ Args:
65
+ text: The input string to process.
66
+
67
+ Returns:
68
+ The string with hotwords replaced.
69
+ """
70
+ # Using the provided absolute path for simplicity in this example
71
+ hotwords_path = config.hotwords_file.as_posix()
72
+
73
+ try:
74
+ with open(hotwords_path, 'r', encoding='utf-8') as f:
75
+ hotwords = json.load(f)
76
+ except FileNotFoundError:
77
+ print(f"Error: hotwords.json not found at {hotwords_path}")
78
+ return text
79
+ except json.JSONDecodeError:
80
+ print(f"Error: Could not decode JSON from {hotwords_path}")
81
+ return text
82
+
83
+ processed_text = text
84
+ # Iterate through the hotwords dictionary
85
+ for key, value in hotwords.items():
86
+ # Replace all occurrences of the key with the value in the text
87
+ processed_text = processed_text.replace(key, value)
88
+
89
+ return processed_text
90
+
91
+
92
  def log_block(key: str, value, unit=''):
93
  if config.DEBUG:
94
+ return
95
  """格式化输出日志内容"""
96
  key_fmt = f"[ {key.ljust(25)}]" # 左对齐填充
97
  val_fmt = f"{value} {unit}".strip()
 
208
  def write(self, result: 'DebugResult'):
209
  with open(self.file_path, mode='a', newline='') as file:
210
  writer = csv.DictWriter(file, fieldnames=self.fieldnames)
211
+ writer.writerow(result.model_dump(by_alias=True))