Spaces:

CleanSong-AI
/

whisper-transcriber

Running

App Files Community

CleanSong committed on Nov 6, 2025

Commit

e96d8eb

verified ·

1 Parent(s): c53f9a3

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -16

app.py CHANGED Viewed

@@ -1,20 +1,46 @@
 import gradio as gr
 import torch
 import torchaudio
-import os, json
 from faster_whisper import WhisperModel
-# === Load model once ===
-device = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
 COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"
-model = WhisperModel(
-    MODEL_NAME,
-    device=device,
-    compute_type=COMPUTE_TYPE,  # float16 on GPU → identical timestamp precision to OpenAI
 )
 def transcribe(file_path):
     # --- Ensure proper audio format ---
     wav, sr = torchaudio.load(file_path)
@@ -26,37 +52,47 @@ def transcribe(file_path):
     torchaudio.save(fixed_path, wav, 16000)
     # --- Transcribe ---
     segments, info = model.transcribe(
         fixed_path,
         beam_size=5,
         word_timestamps=True,
-        vad_filter=True,        # helps prevent drift in pauses
-        suppress_silence=True
     )
     # --- Build transcript list ---
     transcript = []
     for seg in segments:
         for w in seg.words:
             transcript.append({
-                "word": w.word.strip(),
                 "start": w.start,
-                "end": w.end
             })
     if not transcript:
-        transcript = [{"text": seg.text, "start": seg.start, "end": seg.end} for seg in segments]
-    print(f"✅ Transcribed {len(transcript)} words")
     return transcript
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(type="filepath", label="Upload Vocals"),
-    outputs=gr.JSON(label="Transcript"),
     title="CleanSong AI — Whisper Transcriber (Faster-Whisper Large-V3)",
-    description="High-accuracy transcription with precise per-word timestamps at 16 kHz mono (float16)."
 )
 if __name__ == "__main__":

 import gradio as gr
 import torch
 import torchaudio
+import os, json, requests
 from faster_whisper import WhisperModel
+# === CONFIG ===
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
 COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"
+BAD_WORD_URL = (
+    "https://raw.githubusercontent.com/LDNOOBW/"
+    "List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
 )
+# === LOAD PROFANITY LIST ===
+def get_bad_words():
+    """Return the lowercase profanity set used to flag explicit words.
+
+    Downloads the newline-separated list at ``BAD_WORD_URL`` and falls back
+    to a small built-in set when the request fails, the server answers with
+    an error status, or the response contains no usable entries.
+
+    Returns:
+        set[str]: Lowercased, whitespace-stripped words; never empty.
+    """
+    try:
+        print("🌐 Fetching bad-word list from GitHub…")
+        r = requests.get(BAD_WORD_URL, timeout=10)
+        # Turn 4xx/5xx answers into an exception so the reason is logged
+        # instead of silently dropping through to the fallback.
+        r.raise_for_status()
+        words = {w.strip().lower() for w in r.text.splitlines() if w.strip()}
+        if words:
+            print(f"✅ Loaded {len(words)} bad words.")
+            return words
+        # A successful response with a blank body would otherwise yield an
+        # empty set and silently disable explicit-word flagging.
+        print("⚠️ Fetched list was empty.")
+    except requests.RequestException as e:
+        print(f"⚠️ Failed to fetch list: {e}")
+    # fallback local list
+    fallback = {"fuck", "shit", "bitch", "ass", "nigga", "nigger", "pussy", "cunt"}
+    print(f"⚠️ Using fallback list ({len(fallback)} words).")
+    return fallback
+BAD_WORDS = get_bad_words()
+# === LOAD MODEL ===
+print(f"🚀 Loading Whisper model: {MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
+model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
+print("✅ Model ready!")
+# === FUNCTION ===
 def transcribe(file_path):
     # --- Ensure proper audio format ---
     wav, sr = torchaudio.load(file_path)
     torchaudio.save(fixed_path, wav, 16000)
     # --- Transcribe ---
+    print("🎧 Starting transcription…")
     segments, info = model.transcribe(
         fixed_path,
         beam_size=5,
         word_timestamps=True,
+        vad_filter=True,
+        suppress_silence=True,
     )
     # --- Build transcript list ---
     transcript = []
     for seg in segments:
         for w in seg.words:
+            word = w.word.strip()
             transcript.append({
+                "word": word,
                 "start": w.start,
+                "end": w.end,
+                "explicit": word.lower() in BAD_WORDS
             })
     if not transcript:
+        transcript = [{
+            "text": seg.text,
+            "start": seg.start,
+            "end": seg.end,
+            "explicit": False
+        } for seg in segments]
+    print(f"✅ Transcribed {len(transcript)} words "
+          f"({sum(1 for w in transcript if w['explicit'])} explicit).")
     return transcript
 iface = gr.Interface(
     fn=transcribe,
     inputs=gr.Audio(type="filepath", label="Upload Vocals"),
+    outputs=gr.JSON(label="Transcript with Explicit Flags"),
     title="CleanSong AI — Whisper Transcriber (Faster-Whisper Large-V3)",
+    description="Transcribes vocals with per-word timestamps and explicit-word flags "
+                "(auto-updated bad-word list)."
 )
 if __name__ == "__main__":