File size: 6,485 Bytes
73c22fe
 
 
 
bce4a85
52f0004
20277ed
316c57c
61386ba
20277ed
20dd96d
 
 
e96d8eb
20dd96d
 
61386ba
20dd96d
e96d8eb
73c22fe
 
61386ba
4927bb9
20dd96d
 
 
e96d8eb
 
20dd96d
e96d8eb
 
a6041ec
 
 
 
 
 
73c22fe
 
20dd96d
e96d8eb
 
 
 
73c22fe
e96d8eb
20dd96d
73c22fe
20dd96d
 
 
 
 
 
 
52f0004
6ab727a
20dd96d
 
 
 
 
73c22fe
 
 
 
 
 
 
 
 
 
 
 
20dd96d
 
 
 
 
73c22fe
20dd96d
52f0004
20dd96d
bce4a85
20dd96d
 
 
316c57c
61386ba
316c57c
61386ba
73c22fe
61386ba
52f0004
 
20dd96d
 
316c57c
73c22fe
20dd96d
73c22fe
 
 
 
 
 
 
 
 
 
 
20dd96d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73c22fe
20dd96d
 
 
 
73c22fe
20dd96d
 
 
 
6ab727a
 
73c22fe
20dd96d
 
 
 
73c22fe
6ab727a
20dd96d
 
7a79027
73c22fe
6ab727a
20dd96d
 
 
 
73c22fe
20dd96d
73c22fe
 
20dd96d
73c22fe
20dd96d
 
 
 
73c22fe
20dd96d
 
73c22fe
 
 
 
 
 
 
 
 
 
 
 
20dd96d
 
 
 
 
 
73c22fe
20dd96d
 
 
 
 
 
 
52f0004
 
 
e96d8eb
73c22fe
 
 
 
 
 
52f0004
bce4a85
52f0004
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import re
import os
import tempfile

import gradio as gr
import torch
import torchaudio
import requests
from faster_whisper import WhisperModel

# ================================
# CONFIG
# ================================
# Run on GPU when available; float16 requires CUDA, int8 is the CPU-friendly mode.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Accurate model used for the refinement pass (overridable via env var).
MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
# Small model used for the quick first pass (overridable via env var).
FAST_MODEL_NAME = os.getenv("FAST_WHISPER_MODEL", "base")
COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"

# English word list from the LDNOOBW profanity-list project on GitHub.
BAD_WORD_URL = (
    "https://raw.githubusercontent.com/LDNOOBW/"
    "List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
)

# ================================
# BAD WORD LIST
# ================================
def get_bad_words():
    """Fetch the LDNOOBW English profanity list as a set of normalized tokens.

    Each token is lowercased and stripped of non-word characters so it can be
    compared against similarly normalized transcript words.  Falls back to a
    small hard-coded set when the request fails or returns a non-200 status.

    Returns:
        set[str]: normalized bad words (never contains the empty string).
    """
    try:
        print("🌐 Fetching bad-word list…")
        r = requests.get(BAD_WORD_URL, timeout=10)
        if r.status_code == 200:
            words = {
                re.sub(r"[^\w]", "", w.lower())
                for line in r.text.splitlines()
                for w in line.split()
                if w.strip()
            }
            # FIX: tokens made entirely of punctuation normalize to "" above;
            # leaving "" in the set would later flag punctuation-only
            # transcript tokens as explicit.
            words.discard("")
            # Extra words to always catch
            words.update({"hell", "dam", "damn", "yeah"})
            print(f"βœ… Loaded {len(words)} bad words.")
            return words
    except Exception as e:
        print(f"⚠️ Failed to fetch list: {e}")

    return {"fuck", "shit", "bitch", "ass", "damn", "hell"}  # fallback


# Module-level cache: fetched once at import time and reused for every request.
BAD_WORDS = get_bad_words()


# ================================
# UTILITY: SAFE AUDIO LOAD
# ================================
def load_audio_safe(path, target_sr=16000):
    """Load an audio file, downmix to mono, and resample to *target_sr* Hz.

    Returns a ``(waveform, sample_rate)`` pair where the waveform has shape
    ``(1, num_samples)`` and the returned rate always equals ``target_sr``.
    """
    waveform, native_sr = torchaudio.load(path)
    # Average the channels when the file is not already mono.
    mono = waveform if waveform.shape[0] == 1 else waveform.mean(dim=0, keepdim=True)
    # Resample only when the native rate differs from the target.
    if native_sr == target_sr:
        return mono, target_sr
    return torchaudio.functional.resample(mono, native_sr, target_sr), target_sr


# ================================
# LOAD MODELS
# ================================
# Both models are instantiated eagerly at import time so the first request
# does not pay the model-load cost.
print(f"πŸš€ Loading FAST Whisper: {FAST_MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
fast_model = WhisperModel(FAST_MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)

print(f"πŸš€ Loading LARGE Whisper: {MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
large_model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)

print("βœ… All models ready!\n")


# ================================
# MAIN TRANSCRIBE FUNCTION
# ================================
def transcribe(file_path):
    """Two-pass, word-level transcription with explicit-word flagging.

    Pass 1 runs the fast Whisper model over the whole file and flags any word
    whose normalized form appears in BAD_WORDS.  If nothing is flagged the
    fast transcript is returned as-is.  Otherwise pass 2 re-transcribes only
    the flagged word spans with the large model for better text/timestamps.

    Args:
        file_path: path to an audio file readable by torchaudio.

    Returns:
        list[dict]: sorted by "start"; each dict has keys
        "word", "start", "end", "explicit", "explicit_fast".
    """
    # Load + normalize audio (mono, 16 kHz)
    wav, sr = load_audio_safe(file_path)

    # FIX: write the normalized audio to a unique temp file instead of a
    # fixed "input_fixed.wav" in the CWD, which raced between concurrent
    # requests and was never cleaned up.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        fixed_path = tmp.name

    try:
        torchaudio.save(fixed_path, wav, sr)

        # =====================================
        # 1) FAST PASS β€” detect explicit words
        # =====================================
        fast_segments, fast_info = fast_model.transcribe(
            fixed_path,
            beam_size=1,
            word_timestamps=True,
            vad_filter=True,
        )

        transcript = []
        # NOTE(review): fast_info may not expose sample_rate; fall back to
        # the rate we resampled to β€” confirm against faster-whisper version.
        sample_rate = getattr(fast_info, "sample_rate", sr)

        # faster-whisper yields segments lazily and reads the file as the
        # generator is consumed, so it must be fully iterated before the
        # temp file is removed in the finally below.
        for seg in fast_segments:
            if not getattr(seg, "words", None):
                continue
            for w in seg.words:
                clean_word = re.sub(r"[^\w]", "", w.word.strip().lower())
                # FIX: punctuation-only tokens normalize to "" β€” never flag those.
                is_explicit = bool(clean_word) and clean_word in BAD_WORDS
                transcript.append({
                    "word": w.word.strip(),
                    "start": float(w.start),
                    "end": float(w.end),
                    "explicit": is_explicit,
                    "explicit_fast": is_explicit,
                })
    finally:
        # FIX: the normalized-audio temp file was previously never deleted.
        os.remove(fixed_path)

    # =====================================
    # EARLY EXIT IF NO EXPLICIT WORDS
    # =====================================
    if not any(w["explicit_fast"] for w in transcript):
        print("βœ… No explicit words detected β€” returning fast transcript.")
        return transcript

    # =====================================
    # 2) REFINE PASS β€” only explicit words
    # =====================================
    final = []
    for entry in transcript:
        if not entry["explicit_fast"]:
            # Not explicit β€” keep untouched
            final.append(entry)
            continue
        refined = _refine_explicit_word(entry, wav, sample_rate)
        # Fall back to the fast-pass entry when refinement produced nothing.
        final.extend(refined if refined else [entry])

    # Sort by timestamp (critical for assembler)
    final.sort(key=lambda x: x["start"])
    return final


def _refine_explicit_word(entry, wav, sample_rate):
    """Re-transcribe the audio span of one flagged word with the large model.

    Returns a list of refined word dicts with timestamps offset back to
    full-track time, or None when the span is empty, the large model fails,
    or it returns no words (caller then keeps the fast-pass entry).
    """
    start_s = entry["start"]
    end_s = entry["end"]
    chunk = wav[:, int(start_s * sample_rate):int(end_s * sample_rate)]

    # Safety: collapsed timestamps yield an empty chunk.
    if chunk.numel() == 0:
        return None

    # Save chunk to temp file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        chunk_path = tmp.name
    torchaudio.save(chunk_path, chunk, sample_rate)

    try:
        refined_segs, _ = large_model.transcribe(
            chunk_path,
            beam_size=5,
            word_timestamps=True,
            vad_filter=False,
        )
        # FIX: consume the lazy segment generator inside the try β€” decode
        # errors surface during iteration, which the original try around only
        # the transcribe() call could not catch; it also guarantees the file
        # still exists while being read.
        refined_words = [
            {
                "word": w.word.strip(),
                "start": float(w.start) + start_s,
                "end": float(w.end) + start_s,
                "explicit": entry["explicit_fast"],
                "explicit_fast": entry["explicit_fast"],
            }
            for seg in refined_segs
            if getattr(seg, "words", None)
            for w in seg.words
        ]
    except Exception as e:
        print(f"⚠️ Large model failed on chunk: {e} β€” keeping fast result")
        return None
    finally:
        # FIX: remove the chunk file on every path, including exceptions.
        os.remove(chunk_path)

    return refined_words or None


# ================================
# GRADIO UI
# ================================
# Single-input UI: upload an audio file, receive the word-level JSON
# transcript with per-word explicit flags.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Vocals"),
    outputs=gr.JSON(label="Transcript with Explicit Flags"),
    title="CleanSong AI β€” Whisper Transcriber",
    description=(
        "Fast model detects explicit words β†’ "
        "Large model refines only those segments. "
        "Returns word-level timestamps."
    ),
)

# Launch the server only when executed as a script (not when imported).
if __name__ == "__main__":
    iface.launch()