Spaces:
Sleeping
Sleeping
Commit
·
f58f9b6
1
Parent(s):
2003560
Temporarily remove preprocess
Browse files- app/core/asr_engine.py +68 -23
- app/jobs/transcribe_job.py +4 -13
app/core/asr_engine.py
CHANGED
|
@@ -30,27 +30,75 @@ def _clean_transcript(text: str) -> str:
|
|
| 30 |
return ""
|
| 31 |
|
| 32 |
# 1. Remove excessive dots (more than 3 consecutive)
|
| 33 |
-
text = re.sub(r'\.{4,}', '
|
| 34 |
|
| 35 |
# 2. Remove repeated single words (e.g., "chuyền chuyền chuyền...")
|
| 36 |
-
# Match word repeated
|
| 37 |
text = re.sub(r'\b(\w+)(\s+\1){2,}\b', r'\1', text, flags=re.IGNORECASE)
|
| 38 |
|
| 39 |
-
# 3. Remove repeated short phrases (2-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
-
# 4.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
text = re.sub(r'\s+', ' ', text)
|
| 44 |
|
| 45 |
-
#
|
| 46 |
text = re.sub(r'\s+([.,!?])', r'\1', text)
|
| 47 |
|
| 48 |
-
#
|
| 49 |
text = text.strip(' .')
|
| 50 |
|
| 51 |
return text
|
| 52 |
|
| 53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
def _deduplicate_chunks(prev_text: str, curr_text: str, overlap_words: int = 15) -> str:
|
| 55 |
"""
|
| 56 |
Remove overlapping text between consecutive chunks.
|
|
@@ -141,7 +189,7 @@ def transcribe_file_unified(
|
|
| 141 |
) -> Tuple[str, List[Dict]]:
|
| 142 |
"""
|
| 143 |
🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
|
| 144 |
-
|
| 145 |
|
| 146 |
Returns:
|
| 147 |
(text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
|
|
@@ -152,27 +200,17 @@ def transcribe_file_unified(
|
|
| 152 |
start_time = time.time()
|
| 153 |
logger.info("[ASR] Starting unified transcription for %s", wav_path)
|
| 154 |
|
| 155 |
-
# If audio is long, prefer chunked inference to avoid memory/time issues
|
| 156 |
info = get_audio_info(wav_path) or {}
|
| 157 |
duration = info.get("duration", 0)
|
| 158 |
logger.info("[ASR] Audio duration: %.2fs", duration)
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
|
| 164 |
-
)
|
| 165 |
-
elapsed = time.time() - start_time
|
| 166 |
-
logger.info("[ASR] Long audio transcription completed in %.2fs (%.2fx realtime)", elapsed, elapsed / duration if duration else 0)
|
| 167 |
-
return text, chunks
|
| 168 |
-
except Exception:
|
| 169 |
-
logger.exception("transcribe_long_audio failed, falling back to pipeline")
|
| 170 |
-
|
| 171 |
-
# Short audio: single pipeline call with timestamps
|
| 172 |
out = model(
|
| 173 |
wav_path,
|
| 174 |
chunk_length_s=chunk_length_s,
|
| 175 |
-
stride_length_s=
|
| 176 |
return_timestamps=True,
|
| 177 |
)
|
| 178 |
|
|
@@ -187,8 +225,15 @@ def transcribe_file_unified(
|
|
| 187 |
# Extract chunks with timestamps
|
| 188 |
chunks = _extract_chunks_from_output(out)
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
elapsed = time.time() - start_time
|
| 191 |
-
logger.info("[ASR]
|
|
|
|
| 192 |
|
| 193 |
return text, chunks
|
| 194 |
|
|
|
|
| 30 |
return ""
|
| 31 |
|
| 32 |
# 1. Remove excessive dots (more than 3 consecutive)
|
| 33 |
+
text = re.sub(r'\.{4,}', '.', text)
|
| 34 |
|
| 35 |
# 2. Remove repeated single words (e.g., "chuyền chuyền chuyền...")
|
| 36 |
+
# Match word repeated 2+ times consecutively
|
| 37 |
text = re.sub(r'\b(\w+)(\s+\1){2,}\b', r'\1', text, flags=re.IGNORECASE)
|
| 38 |
|
| 39 |
+
# 3. Remove repeated short phrases (2-5 words repeated 2+ times)
|
| 40 |
+
# More aggressive pattern to catch "biết chính xác mình cần làm" repeats
|
| 41 |
+
for phrase_len in [5, 4, 3, 2]:
|
| 42 |
+
pattern = r'((?:\S+\s+){' + str(phrase_len) + r'})\1{1,}'
|
| 43 |
+
text = re.sub(pattern, r'\1', text)
|
| 44 |
|
| 45 |
+
# 4. Remove long repeated phrases (like "thế giới trên cầu" repeated many times)
|
| 46 |
+
# Find and remove sequences where same phrase appears 3+ times
|
| 47 |
+
words = text.split()
|
| 48 |
+
if len(words) > 10:
|
| 49 |
+
text = _remove_long_repeats(text)
|
| 50 |
+
|
| 51 |
+
# 5. Clean up multiple spaces
|
| 52 |
text = re.sub(r'\s+', ' ', text)
|
| 53 |
|
| 54 |
+
# 6. Clean up space before punctuation
|
| 55 |
text = re.sub(r'\s+([.,!?])', r'\1', text)
|
| 56 |
|
| 57 |
+
# 7. Remove trailing/leading dots and spaces
|
| 58 |
text = text.strip(' .')
|
| 59 |
|
| 60 |
return text
|
| 61 |
|
| 62 |
|
| 63 |
+
def _remove_long_repeats(text: str) -> str:
|
| 64 |
+
"""
|
| 65 |
+
Remove long repeated phrases that regex can't easily catch.
|
| 66 |
+
Looks for phrases of 3-8 words that repeat consecutively.
|
| 67 |
+
"""
|
| 68 |
+
words = text.split()
|
| 69 |
+
if len(words) < 10:
|
| 70 |
+
return text
|
| 71 |
+
|
| 72 |
+
result = []
|
| 73 |
+
i = 0
|
| 74 |
+
|
| 75 |
+
while i < len(words):
|
| 76 |
+
# Try to find repeating patterns of length 3-8 words
|
| 77 |
+
found_repeat = False
|
| 78 |
+
for phrase_len in range(8, 2, -1): # Check longer phrases first
|
| 79 |
+
if i + phrase_len * 2 > len(words):
|
| 80 |
+
continue
|
| 81 |
+
|
| 82 |
+
phrase = words[i:i+phrase_len]
|
| 83 |
+
next_phrase = words[i+phrase_len:i+phrase_len*2]
|
| 84 |
+
|
| 85 |
+
if phrase == next_phrase:
|
| 86 |
+
# Found a repeat, skip all consecutive repeats
|
| 87 |
+
result.extend(phrase)
|
| 88 |
+
j = i + phrase_len
|
| 89 |
+
while j + phrase_len <= len(words) and words[j:j+phrase_len] == phrase:
|
| 90 |
+
j += phrase_len
|
| 91 |
+
i = j
|
| 92 |
+
found_repeat = True
|
| 93 |
+
break
|
| 94 |
+
|
| 95 |
+
if not found_repeat:
|
| 96 |
+
result.append(words[i])
|
| 97 |
+
i += 1
|
| 98 |
+
|
| 99 |
+
return ' '.join(result)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
def _deduplicate_chunks(prev_text: str, curr_text: str, overlap_words: int = 15) -> str:
|
| 103 |
"""
|
| 104 |
Remove overlapping text between consecutive chunks.
|
|
|
|
| 189 |
) -> Tuple[str, List[Dict]]:
|
| 190 |
"""
|
| 191 |
🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
|
| 192 |
+
Uses Whisper's built-in chunking mechanism instead of manual splitting to avoid hallucination.
|
| 193 |
|
| 194 |
Returns:
|
| 195 |
(text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
|
|
|
|
| 200 |
start_time = time.time()
|
| 201 |
logger.info("[ASR] Starting unified transcription for %s", wav_path)
|
| 202 |
|
|
|
|
| 203 |
info = get_audio_info(wav_path) or {}
|
| 204 |
duration = info.get("duration", 0)
|
| 205 |
logger.info("[ASR] Audio duration: %.2fs", duration)
|
| 206 |
|
| 207 |
+
# 🔥 FIX: Always use single pipeline call with Whisper's built-in chunking
|
| 208 |
+
# Manual chunking causes text repetition and hallucination
|
| 209 |
+
# Whisper's internal chunking handles long audio properly
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
out = model(
|
| 211 |
wav_path,
|
| 212 |
chunk_length_s=chunk_length_s,
|
| 213 |
+
stride_length_s=(chunk_length_s // 6, chunk_length_s // 6), # ~5s left/right context
|
| 214 |
return_timestamps=True,
|
| 215 |
)
|
| 216 |
|
|
|
|
| 225 |
# Extract chunks with timestamps
|
| 226 |
chunks = _extract_chunks_from_output(out)
|
| 227 |
|
| 228 |
+
# 🔥 FIX: Clean up ASR artifacts (repeated words/phrases, hallucinations)
|
| 229 |
+
text = _clean_transcript(text)
|
| 230 |
+
for chunk in chunks:
|
| 231 |
+
if chunk.get("text"):
|
| 232 |
+
chunk["text"] = _clean_transcript(chunk["text"])
|
| 233 |
+
|
| 234 |
elapsed = time.time() - start_time
|
| 235 |
+
logger.info("[ASR] Transcription completed in %.2fs (%.2fx realtime)",
|
| 236 |
+
elapsed, elapsed / duration if duration else 0)
|
| 237 |
|
| 238 |
return text, chunks
|
| 239 |
|
app/jobs/transcribe_job.py
CHANGED
|
@@ -50,24 +50,15 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
|
|
| 50 |
wav_path = download_audio(audio_url)
|
| 51 |
logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
|
| 52 |
|
| 53 |
-
#
|
| 54 |
try:
|
| 55 |
info = get_audio_info(wav_path) or {}
|
| 56 |
logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
|
| 57 |
info.get("duration", 0), info.get("samplerate"), info.get("channels"))
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
tmp_wav = make_temp_path(suffix=".wav")
|
| 61 |
-
ensure_wav_16k_mono(wav_path, tmp_wav)
|
| 62 |
-
logger.info("[JOB] Converted to 16k mono in %.2fs", time.time() - convert_start)
|
| 63 |
-
# replace wav_path with converted file and remove original
|
| 64 |
-
try:
|
| 65 |
-
os.remove(wav_path)
|
| 66 |
-
except Exception:
|
| 67 |
-
pass
|
| 68 |
-
wav_path = tmp_wav
|
| 69 |
except Exception:
|
| 70 |
-
logger.exception("Failed to
|
| 71 |
|
| 72 |
# 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
|
| 73 |
asr_start = time.time()
|
|
|
|
| 50 |
wav_path = download_audio(audio_url)
|
| 51 |
logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
|
| 52 |
|
| 53 |
+
# Log audio info (skip conversion since client already sends 16kHz mono WAV)
|
| 54 |
try:
|
| 55 |
info = get_audio_info(wav_path) or {}
|
| 56 |
logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
|
| 57 |
info.get("duration", 0), info.get("samplerate"), info.get("channels"))
|
| 58 |
+
# 🔥 FIX: Skip conversion - client already pre-processes to 16kHz mono WAV
|
| 59 |
+
# Unnecessary conversion may cause audio quality degradation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
except Exception:
|
| 61 |
+
logger.exception("Failed to get audio info for %s", wav_path)
|
| 62 |
|
| 63 |
# 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
|
| 64 |
asr_start = time.time()
|