Spaces:

CleanSong-AI
/

whisper-transcriber

Running

App Files Community

CleanSong committed on Nov 17, 2025

Commit

6ab727a

verified ·

1 Parent(s): be47ae1

Update app.py

Browse files

Files changed (1) hide show

app.py +63 -79

app.py CHANGED Viewed

@@ -41,18 +41,17 @@ print("✅ Models ready!")
 # === TRANSCRIBE FUNCTION (HYBRID WORD-LEVEL) ===
 def transcribe(file_path):
-    # --- Ensure proper audio format (mono, 16k) ---
     wav, sr = torchaudio.load(file_path)
     target_sr = 16000
     if sr != target_sr:
         wav = torchaudio.functional.resample(wav, sr, target_sr)
     if wav.shape[0] > 1:
-        wav = wav.mean(dim=0, keepdim=True)  # mono
     fixed_path = "input_fixed.wav"
     torchaudio.save(fixed_path, wav, target_sr)
-    # --- FAST PASS (cheap) ---
-    print("⚡ Running fast pass to detect candidate explicit words…")
     fast_segments, fast_info = fast_model.transcribe(
         fixed_path,
         beam_size=1,
@@ -61,101 +60,86 @@ def transcribe(file_path):
     )
     sample_rate = getattr(fast_info, "sample_rate", target_sr)
-    # Build initial transcript
     transcript = []
     for seg in fast_segments:
         if hasattr(seg, "words") and seg.words:
             for w in seg.words:
                 word_text = w.word.strip()
-                start = float(w.start)
-                end = float(w.end)
                 transcript.append({
                     "word": word_text,
-                    "start": start,
-                    "end": end,
-                    "explicit": word_text.lower() in BAD_WORDS
                 })
         else:
             transcript.append({
                 "text": seg.text,
                 "start": float(seg.start),
                 "end": float(seg.end),
-                "explicit": False
             })
-    # --- SECOND PASS: large model on explicit words only ---
     flagged_words = [t for t in transcript if t.get("explicit")]
-    if flagged_words:
-        print(f"🔎 Fast pass flagged {len(flagged_words)} explicit words — refining with large model…")
-        refined_entries = []
-        for idx, w in enumerate(flagged_words):
-            s, e = w["start"], w["end"]
-            print(f"⏱️ Refining word {idx+1}/{len(flagged_words)}: {s:.2f}s -> {e:.2f}s")
-            start_sample = int(max(0, s * sample_rate))
-            end_sample = int(min(wav.shape[-1], e * sample_rate))
-            num_frames = max(0, end_sample - start_sample)
-            if num_frames == 0:
-                continue
-            chunk = wav[:, start_sample:end_sample]
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
-                temp_path = tmp.name
-            torchaudio.save(temp_path, chunk, sample_rate)
-            segs, _ = large_model.transcribe(
-                temp_path,
-                beam_size=5,
-                word_timestamps=True,
-                vad_filter=True
-            )
-            for seg in segs:
-                if hasattr(seg, "words") and seg.words:
-                    for word_obj in seg.words:
-                        refined_entries.append({
-                            "word": word_obj.word.strip(),
-                            "start": float(word_obj.start) + s,
-                            "end": float(word_obj.end) + s,
-                            "explicit": word_obj.word.strip().lower() in BAD_WORDS
-                        })
-                else:
                     refined_entries.append({
-                        "text": seg.text,
-                        "start": float(seg.start) + s,
-                        "end": float(seg.end) + s,
-                        "explicit": False
                     })
-            try:
-                os.remove(temp_path)
-            except Exception:
-                pass
-        # Merge refined words back into transcript
-        final_transcript = []
-        for t in transcript:
-            if t.get("explicit") and refined_entries:
-                final_transcript.append(refined_entries.pop(0))
-            else:
-                final_transcript.append(t)
-        transcript = final_transcript
-    else:
-        print("✅ No flagged words — skipping large-model refinement.")
-    # --- fallback if transcript empty ---
-    if not transcript:
-        transcript = [{
-            "text": seg.text,
-            "start": float(seg.start),
-            "end": float(seg.end),
-            "explicit": False
-        } for seg in fast_segments]
-    print(f"✅ Final transcript contains {len(transcript)} entries "
-          f"({sum(1 for w in transcript if w.get('explicit'))} explicit). {transcript[:200]}")
-    return transcript
 # === GRADIO INTERFACE ===
 iface = gr.Interface(

 # === TRANSCRIBE FUNCTION (HYBRID WORD-LEVEL) ===
 def transcribe(file_path):
+    # --- Ensure proper audio format ---
     wav, sr = torchaudio.load(file_path)
     target_sr = 16000
     if sr != target_sr:
         wav = torchaudio.functional.resample(wav, sr, target_sr)
     if wav.shape[0] > 1:
+        wav = wav.mean(dim=0, keepdim=True)
     fixed_path = "input_fixed.wav"
     torchaudio.save(fixed_path, wav, target_sr)
+    # --- FAST PASS ---
     fast_segments, fast_info = fast_model.transcribe(
         fixed_path,
         beam_size=1,
     )
     sample_rate = getattr(fast_info, "sample_rate", target_sr)
+    # Initial transcript with explicit flags
     transcript = []
     for seg in fast_segments:
         if hasattr(seg, "words") and seg.words:
             for w in seg.words:
                 word_text = w.word.strip()
+                is_explicit = word_text.lower() in BAD_WORDS
                 transcript.append({
                     "word": word_text,
+                    "start": float(w.start),
+                    "end": float(w.end),
+                    "explicit": is_explicit,   # 🔥 keep fast-pass explicit flag
+                    "explicit_fast": is_explicit  # permanent record of fast-pass
                 })
         else:
             transcript.append({
                 "text": seg.text,
                 "start": float(seg.start),
                 "end": float(seg.end),
+                "explicit": False,
+                "explicit_fast": False
             })
+    # --- SECOND PASS: refine explicit words only ---
     flagged_words = [t for t in transcript if t.get("explicit")]
+    refined_entries = []
+    for idx, w in enumerate(flagged_words):
+        s, e = w["start"], w["end"]
+        start_sample = int(max(0, s * sample_rate))
+        end_sample = int(min(wav.shape[-1], e * sample_rate))
+        chunk = wav[:, start_sample:end_sample]
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
+            temp_path = tmp.name
+        torchaudio.save(temp_path, chunk, sample_rate)
+        segs, _ = large_model.transcribe(
+            temp_path,
+            beam_size=5,
+            word_timestamps=True,
+            vad_filter=True
+        )
+        for seg in segs:
+            if hasattr(seg, "words") and seg.words:
+                for word_obj in seg.words:
+                    # 🔥 Keep explicit from fast-pass instead of trusting large model
+                    orig_explicit = w.get("explicit_fast", False)
                     refined_entries.append({
+                        "word": word_obj.word.strip(),
+                        "start": float(word_obj.start) + s,
+                        "end": float(word_obj.end) + s,
+                        "explicit": orig_explicit,  # preserve explicit
+                        "explicit_fast": orig_explicit
                     })
+            else:
+                refined_entries.append({
+                    "text": seg.text,
+                    "start": float(seg.start) + s,
+                    "end": float(seg.end) + s,
+                    "explicit": w.get("explicit_fast", False),
+                    "explicit_fast": w.get("explicit_fast", False)
+                })
+        try:
+            os.remove(temp_path)
+        except Exception:
+            pass
+    # Merge refined words back, keeping fast-pass explicit
+    final_transcript = []
+    refined_iter = iter(refined_entries)
+    for t in transcript:
+        if t.get("explicit"):
+            final_transcript.append(next(refined_iter))
+        else:
+            final_transcript.append(t)
+    return final_transcript
 # === GRADIO INTERFACE ===
 iface = gr.Interface(