Spaces:

bichnhan2701
/

PhoWhisperBaseAPI

Sleeping

App Files Files Community

bichnhan2701 committed on Jan 7

Commit

f58f9b6

1 Parent(s): 2003560

Tempt remove preprocess

Browse files

Files changed (2) hide show

app/core/asr_engine.py +68 -23
app/jobs/transcribe_job.py +4 -13

app/core/asr_engine.py CHANGED Viewed

@@ -30,27 +30,75 @@ def _clean_transcript(text: str) -> str:
         return ""
     # 1. Remove excessive dots (more than 3 consecutive)
-    text = re.sub(r'\.{4,}', '...', text)
     # 2. Remove repeated single words (e.g., "chuyền chuyền chuyền...")
-    # Match word repeated 3+ times consecutively
     text = re.sub(r'\b(\w+)(\s+\1){2,}\b', r'\1', text, flags=re.IGNORECASE)
-    # 3. Remove repeated short phrases (2-3 words repeated 3+ times)
-    text = re.sub(r'((?:\S+\s+){1,3}?)\1{2,}', r'\1', text)
-    # 4. Clean up multiple spaces
     text = re.sub(r'\s+', ' ', text)
-    # 5. Clean up space before punctuation
     text = re.sub(r'\s+([.,!?])', r'\1', text)
-    # 6. Remove trailing/leading dots and spaces
     text = text.strip(' .')
     return text
 def _deduplicate_chunks(prev_text: str, curr_text: str, overlap_words: int = 15) -> str:
     """
     Remove overlapping text between consecutive chunks.
@@ -141,7 +189,7 @@ def transcribe_file_unified(
 ) -> Tuple[str, List[Dict]]:
     """
     🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
-    This avoids the costly double-inference that was causing timeouts.
     Returns:
         (text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
@@ -152,27 +200,17 @@ def transcribe_file_unified(
     start_time = time.time()
     logger.info("[ASR] Starting unified transcription for %s", wav_path)
-    # If audio is long, prefer chunked inference to avoid memory/time issues
     info = get_audio_info(wav_path) or {}
     duration = info.get("duration", 0)
     logger.info("[ASR] Audio duration: %.2fs", duration)
-    if duration and duration > chunk_length_s:
-        try:
-            text, chunks = transcribe_long_audio(
-                model, wav_path, chunk_length_s=chunk_length_s, overlap_s=stride_s
-            )
-            elapsed = time.time() - start_time
-            logger.info("[ASR] Long audio transcription completed in %.2fs (%.2fx realtime)", elapsed, elapsed / duration if duration else 0)
-            return text, chunks
-        except Exception:
-            logger.exception("transcribe_long_audio failed, falling back to pipeline")
-    # Short audio: single pipeline call with timestamps
     out = model(
         wav_path,
         chunk_length_s=chunk_length_s,
-        stride_length_s=stride_s,
         return_timestamps=True,
     )
@@ -187,8 +225,15 @@ def transcribe_file_unified(
     # Extract chunks with timestamps
     chunks = _extract_chunks_from_output(out)
     elapsed = time.time() - start_time
-    logger.info("[ASR] Short audio transcription completed in %.2fs", elapsed)
     return text, chunks

         return ""
     # 1. Remove excessive dots (more than 3 consecutive)
+    text = re.sub(r'\.{4,}', '.', text)
     # 2. Remove repeated single words (e.g., "chuyền chuyền chuyền...")
+    # Match a word repeated 3+ times consecutively (the word plus 2+ repeats)
     text = re.sub(r'\b(\w+)(\s+\1){2,}\b', r'\1', text, flags=re.IGNORECASE)
+    # 3. Remove repeated short phrases (2-5 words repeated 2+ times)
+    # More aggressive pattern to catch "biết chính xác mình cần làm" repeats
+    for phrase_len in [5, 4, 3, 2]:
+        pattern = r'((?:\S+\s+){' + str(phrase_len) + r'})\1{1,}'
+        text = re.sub(pattern, r'\1', text)
+    # 4. Remove long repeated phrases (like "thế giới trên cầu" repeated many times)
+    # Collapse runs where the same 3-8 word phrase appears 2+ times consecutively
+    words = text.split()
+    if len(words) > 10:
+        text = _remove_long_repeats(text)
+    # 5. Clean up multiple spaces
     text = re.sub(r'\s+', ' ', text)
+    # 6. Clean up space before punctuation
     text = re.sub(r'\s+([.,!?])', r'\1', text)
+    # 7. Remove trailing/leading dots and spaces
     text = text.strip(' .')
     return text
+def _remove_long_repeats(text: str) -> str:
+    """
+    Remove long repeated phrases that regex can't easily catch.
+    Looks for phrases of 3-8 words that repeat consecutively.
+    """
+    words = text.split()
+    if len(words) < 10:
+        return text
+    result = []
+    i = 0
+    while i < len(words):
+        # Try to find repeating patterns of length 3-8 words
+        found_repeat = False
+        for phrase_len in range(8, 2, -1):  # Check longer phrases first
+            if i + phrase_len * 2 > len(words):
+                continue
+            phrase = words[i:i+phrase_len]
+            next_phrase = words[i+phrase_len:i+phrase_len*2]
+            if phrase == next_phrase:
+                # Found a repeat, skip all consecutive repeats
+                result.extend(phrase)
+                j = i + phrase_len
+                while j + phrase_len <= len(words) and words[j:j+phrase_len] == phrase:
+                    j += phrase_len
+                i = j
+                found_repeat = True
+                break
+        if not found_repeat:
+            result.append(words[i])
+            i += 1
+    return ' '.join(result)
 def _deduplicate_chunks(prev_text: str, curr_text: str, overlap_words: int = 15) -> str:
     """
     Remove overlapping text between consecutive chunks.
 ) -> Tuple[str, List[Dict]]:
     """
     🔥 UNIFIED: Return both full transcript text AND timestamped chunks in ONE inference pass.
+    Uses Whisper's built-in chunking mechanism instead of manual splitting to avoid hallucination.
     Returns:
         (text, chunks) where chunks = [{"start": float, "end": float, "text": str}, ...]
     start_time = time.time()
     logger.info("[ASR] Starting unified transcription for %s", wav_path)
     info = get_audio_info(wav_path) or {}
     duration = info.get("duration", 0)
     logger.info("[ASR] Audio duration: %.2fs", duration)
+    # 🔥 FIX: Always use single pipeline call with Whisper's built-in chunking
+    # Manual chunking causes text repetition and hallucination
+    # Whisper's internal chunking handles long audio properly
     out = model(
         wav_path,
         chunk_length_s=chunk_length_s,
+        stride_length_s=(chunk_length_s // 6, chunk_length_s // 6),  # ~5s left/right context
         return_timestamps=True,
     )
     # Extract chunks with timestamps
     chunks = _extract_chunks_from_output(out)
+    # 🔥 FIX: Clean up ASR artifacts (repeated words/phrases, hallucinations)
+    text = _clean_transcript(text)
+    for chunk in chunks:
+        if chunk.get("text"):
+            chunk["text"] = _clean_transcript(chunk["text"])
     elapsed = time.time() - start_time
+    logger.info("[ASR] Transcription completed in %.2fs (%.2fx realtime)",
+                elapsed, elapsed / duration if duration else 0)
     return text, chunks

app/jobs/transcribe_job.py CHANGED Viewed

@@ -50,24 +50,15 @@ def transcribe_job(audio_url: str, note_id: str, user_id: str | None = None):
         wav_path = download_audio(audio_url)
         logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
-        # Ensure WAV is 16k mono for consistent chunking and ASR behavior
         try:
             info = get_audio_info(wav_path) or {}
             logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
                         info.get("duration", 0), info.get("samplerate"), info.get("channels"))
-            if info.get("samplerate") != 16000 or info.get("channels") != 1:
-                convert_start = time.time()
-                tmp_wav = make_temp_path(suffix=".wav")
-                ensure_wav_16k_mono(wav_path, tmp_wav)
-                logger.info("[JOB] Converted to 16k mono in %.2fs", time.time() - convert_start)
-                # replace wav_path with converted file and remove original
-                try:
-                    os.remove(wav_path)
-                except Exception:
-                    pass
-                wav_path = tmp_wav
         except Exception:
-            logger.exception("Failed to ensure wav format for %s", wav_path)
         # 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
         asr_start = time.time()

         wav_path = download_audio(audio_url)
         logger.info("[JOB] Downloaded audio in %.2fs", time.time() - download_start)
+        # Log audio info (skip conversion since client already sends 16kHz mono WAV)
         try:
             info = get_audio_info(wav_path) or {}
             logger.info("[JOB] Audio info: duration=%.2fs, samplerate=%s, channels=%s",
                         info.get("duration", 0), info.get("samplerate"), info.get("channels"))
+            # 🔥 FIX: Skip conversion - client already pre-processes to 16kHz mono WAV
+            # Unnecessary conversion may cause audio quality degradation
         except Exception:
+            logger.exception("Failed to get audio info for %s", wav_path)
         # 2️⃣ ASR - 🔥 SINGLE INFERENCE using unified function
         asr_start = time.time()