bichnhan2701 committed on
Commit
f58f9b6
·
1 Parent(s): 2003560

Temporarily remove preprocessing

Browse files
Files changed (2) hide show
  1. app/core/asr_engine.py +68 -23
  2. app/jobs/transcribe_job.py +4 -13
app/core/asr_engine.py CHANGED
@@ -30,27 +30,75 @@ def _clean_transcript(text: str) -> str:
30
  return ""
31
 
32
  # 1. Remove excessive dots (more than 3 consecutive)
33
- text = re.sub(r'\.{4,}', '...', text)
34
 
35
  # 2. Remove repeated single words (e.g., "chuyền chuyền chuyền...")
36
- # Match word repeated 3+ times consecutively
37
  text = re.sub(r'\b(\w+)(\s+\1){2,}\b', r'\1', text, flags=re.IGNORECASE)
38
 
39
- # 3. Remove repeated short phrases (2-3 words repeated 3+ times)
40
- text = re.sub(r'((?:\S+\s+){1,3}?)\1{2,}', r'\1', text)
 
 
 
41
 
42
- # 4. Clean up multiple spaces
 
 
 
 
 
 
43
  text = re.sub(r'\s+', ' ', text)
44
 
45
- # 5. Clean up space before punctuation
46
  text = re.sub(r'\s+([.,!?])', r'\1', text)
47
 
48
- # 6. Remove trailing/leading dots and spaces
49
  text = text.strip(' .')
50
 
51
  return text
52
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  def _deduplicate_chunks(prev_text: str, curr_text: str, overlap_words: int = 15) -> str:
55
  """
56
  Remove overlapping text between consecutive chunks.
@@ -141,7 +189,7 @@ def transcribe_file_unified(
141
  ) -> Tuple[str, List[Dict]]:
142
  """
143
  🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
144
- This avoids the costly double-inference that was causing timeouts.
145
 
146
  Returns:
147
  (text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
@@ -152,27 +200,17 @@ def transcribe_file_unified(
152
  start_time = time.time()
153
  logger.info("[ASR] Starting unified transcription for %s", wav_path)
154
 
155
- # If audio is long, prefer chunked inference to avoid memory/time issues
156
  info = get_audio_info(wav_path) or {}
157
  duration = info.get("duration", 0)
158
  logger.info("[ASR] Audio duration: %.2fs", duration)
159
 
160
- if duration and duration > chunk_length_s:
161
- try:
162
- text, chunks = transcribe_long_audio(
163
- model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
164
- )
165
- elapsed = time.time() - start_time
166
- logger.info("[ASR] Long audio transcription completed in %.2fs (%.2fx realtime)", elapsed, elapsed / duration if duration else 0)
167
- return text, chunks
168
- except Exception:
169
- logger.exception("transcribe_long_audio failed, falling back to pipeline")
170
-
171
- # Short audio: single pipeline call with timestamps
172
  out = model(
173
  wav_path,
174
  chunk_length_s=chunk_length_s,
175
- stride_length_s=stride_s,
176
  return_timestamps=True,
177
  )
178
 
@@ -187,8 +225,15 @@ def transcribe_file_unified(
187
  # Extract chunks with timestamps
188
  chunks = _extract_chunks_from_output(out)
189
 
 
 
 
 
 
 
190
  elapsed = time.time() - start_time
191
- logger.info("[ASR] Short audio transcription completed in %.2fs", elapsed)
 
192
 
193
  return text, chunks
194
 
 
30
  return ""
31
 
32
  # 1. Remove excessive dots (more than 3 consecutive)
33
+ text = re.sub(r'\.{4,}', '.', text)
34
 
35
  # 2. Remove repeated single words (e.g., "chuyền chuyền chuyền...")
36
+ # Match a word that occurs 3+ times consecutively (the word followed by 2+ repeats)
37
  text = re.sub(r'\b(\w+)(\s+\1){2,}\b', r'\1', text, flags=re.IGNORECASE)
38
 
39
+ # 3. Remove repeated short phrases (2-5 words repeated 2+ times)
40
+ # More aggressive pattern to catch "biết chính xác mình cần làm" repeats
41
+ for phrase_len in [5, 4, 3, 2]:
42
+ pattern = r'((?:\S+\s+){' + str(phrase_len) + r'})\1{1,}'
43
+ text = re.sub(pattern, r'\1', text)
44
 
45
+ # 4. Remove long repeated phrases (like "thế giới trên cầu" repeated many times)
46
+ # Find and collapse sequences where the same 3-8 word phrase appears 2+ times in a row
47
+ words = text.split()
48
+ if len(words) > 10:
49
+ text = _remove_long_repeats(text)
50
+
51
+ # 5. Clean up multiple spaces
52
  text = re.sub(r'\s+', ' ', text)
53
 
54
+ # 6. Clean up space before punctuation
55
  text = re.sub(r'\s+([.,!?])', r'\1', text)
56
 
57
+ # 7. Remove trailing/leading dots and spaces
58
  text = text.strip(' .')
59
 
60
  return text
61
 
62
 
63
+ def _remove_long_repeats(text: str) -> str:
64
+ """
65
+ Remove long repeated phrases that regex can't easily catch.
66
+ Looks for phrases of 3-8 words that repeat consecutively.
67
+ """
68
+ words = text.split()
69
+ if len(words) < 10:
70
+ return text
71
+
72
+ result = []
73
+ i = 0
74
+
75
+ while i < len(words):
76
+ # Try to find repeating patterns of length 3-8 words
77
+ found_repeat = False
78
+ for phrase_len in range(8, 2, -1): # Check longer phrases first
79
+ if i + phrase_len * 2 > len(words):
80
+ continue
81
+
82
+ phrase = words[i:i+phrase_len]
83
+ next_phrase = words[i+phrase_len:i+phrase_len*2]
84
+
85
+ if phrase == next_phrase:
86
+ # Found a repeat, skip all consecutive repeats
87
+ result.extend(phrase)
88
+ j = i + phrase_len
89
+ while j + phrase_len <= len(words) and words[j:j+phrase_len] == phrase:
90
+ j += phrase_len
91
+ i = j
92
+ found_repeat = True
93
+ break
94
+
95
+ if not found_repeat:
96
+ result.append(words[i])
97
+ i += 1
98
+
99
+ return ' '.join(result)
100
+
101
+
102
  def _deduplicate_chunks(prev_text: str, curr_text: str, overlap_words: int = 15) -> str:
103
  """
104
  Remove overlapping text between consecutive chunks.
 
189
  ) -> Tuple[str, List[Dict]]:
190
  """
191
  🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
192
+ Uses Whisper's built-in chunking mechanism instead of manual splitting to avoid hallucination.
193
 
194
  Returns:
195
  (text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
 
200
  start_time = time.time()
201
  logger.info("[ASR] Starting unified transcription for %s", wav_path)
202
 
 
203
  info = get_audio_info(wav_path) or {}
204
  duration = info.get("duration", 0)
205
  logger.info("[ASR] Audio duration: %.2fs", duration)
206
 
207
+ # 🔥 FIX: Always use single pipeline call with Whisper's built-in chunking
208
+ # Manual chunking causes text repetition and hallucination
209
+ # Whisper's internal chunking handles long audio properly
 
 
 
 
 
 
 
 
 
210
  out = model(
211
  wav_path,
212
  chunk_length_s=chunk_length_s,
213
+ stride_length_s=(chunk_length_s // 6, chunk_length_s // 6), # ~5s left/right context
214
  return_timestamps=True,
215
  )
216
 
 
225
  # Extract chunks with timestamps
226
  chunks = _extract_chunks_from_output(out)
227
 
228
+ # 🔥 FIX: Clean up ASR artifacts (repeated words/phrases, hallucinations)
229
+ text = _clean_transcript(text)
230
+ for chunk in chunks:
231
+ if chunk.get("text"):
232
+ chunk["text"] = _clean_transcript(chunk["text"])
233
+
234
  elapsed = time.time() - start_time
235
+ logger.info("[ASR] Transcription completed in %.2fs (%.2fx realtime)",
236
+ elapsed, elapsed / duration if duration else 0)
237
 
238
  return text, chunks
239
 
app/jobs/transcribe_job.py CHANGED
@@ -50,24 +50,15 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
50
  wav_path = download_audio(audio_url)
51
  logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
52
 
53
- # Ensure WAV is 16k mono for consistent chunking and ASR behavior
54
  try:
55
  info = get_audio_info(wav_path) or {}
56
  logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
57
  info.get("duration", 0), info.get("samplerate"), info.get("channels"))
58
- if info.get("samplerate") != 16000 or info.get("channels") != 1:
59
- convert_start = time.time()
60
- tmp_wav = make_temp_path(suffix=".wav")
61
- ensure_wav_16k_mono(wav_path, tmp_wav)
62
- logger.info("[JOB] Converted to 16k mono in %.2fs", time.time() - convert_start)
63
- # replace wav_path with converted file and remove original
64
- try:
65
- os.remove(wav_path)
66
- except Exception:
67
- pass
68
- wav_path = tmp_wav
69
  except Exception:
70
- logger.exception("Failed to ensure wav format for %s", wav_path)
71
 
72
  # 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
73
  asr_start = time.time()
 
50
  wav_path = download_audio(audio_url)
51
  logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
52
 
53
+ # Log audio info (skip conversion since client already sends 16kHz mono WAV)
54
  try:
55
  info = get_audio_info(wav_path) or {}
56
  logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
57
  info.get("duration", 0), info.get("samplerate"), info.get("channels"))
58
+ # 🔥 FIX: Skip conversion - client already pre-processes to 16kHz mono WAV
59
+ # Unnecessary conversion may cause audio quality degradation
 
 
 
 
 
 
 
 
 
60
  except Exception:
61
+ logger.exception("Failed to get audio info for %s", wav_path)
62
 
63
  # 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
64
  asr_start = time.time()