| import re |
| import os |
| import tempfile |
|
|
| import gradio as gr |
| import torch |
| import torchaudio |
| import requests |
| from faster_whisper import WhisperModel |
|
|
| |
| |
| |
# Runtime configuration, resolved once at import time.
# Use the GPU when available; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Model sizes are overridable via environment variables.
MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
FAST_MODEL_NAME = os.getenv("FAST_WHISPER_MODEL", "base")
# float16 needs CUDA; int8 keeps CPU inference memory-friendly.
COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"

# Upstream English profanity list (LDNOOBW project, raw text, one term per line).
BAD_WORD_URL = (
    "https://raw.githubusercontent.com/LDNOOBW/"
    "List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
)
|
|
| |
| |
| |
def get_bad_words():
    """Fetch the LDNOOBW English profanity list and return it as a set.

    Falls back to a small built-in set when the request fails or the server
    returns a non-200 status, so filtering still works offline.

    Returns:
        set[str]: lowercase, punctuation-stripped profanity terms.
    """
    try:
        print("Fetching bad-word list...")
        r = requests.get(BAD_WORD_URL, timeout=10)
        if r.status_code == 200:
            # Normalise every token (lowercase, strip non-word characters) so
            # lookups against cleaned transcript words are punctuation-insensitive.
            words = {
                re.sub(r"[^\w]", "", w.lower())
                for line in r.text.splitlines()
                for w in line.split()
                if w.strip()
            }
            # Local additions / common misspellings missing upstream.
            words.update({"hell", "dam", "damn", "yeah"})
            print(f"Loaded {len(words)} bad words.")
            return words
        # Non-200 is not an exception with requests.get; log it explicitly
        # instead of silently falling through to the fallback set.
        print(f"Bad-word list request returned HTTP {r.status_code}; using fallback.")
    except requests.RequestException as e:
        # Network problems are expected (offline use); keep this best-effort.
        print(f"Failed to fetch list: {e}")

    # Minimal built-in fallback.
    return {"fuck", "shit", "bitch", "ass", "damn", "hell"}
|
|
|
|
# Module-level profanity set, fetched once at import time.
BAD_WORDS = get_bad_words()
|
|
|
|
| |
| |
| |
def load_audio_safe(path, target_sr=16000):
    """Load an audio file as a mono waveform at ``target_sr`` Hz.

    Args:
        path: Path to an audio file readable by torchaudio.
        target_sr: Desired output sample rate (default 16 kHz).

    Returns:
        Tuple of (waveform tensor of shape ``(1, num_samples)``, ``target_sr``).
    """
    waveform, source_sr = torchaudio.load(path)
    # Down-mix multi-channel audio to mono by averaging the channels.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample only when the source rate differs from the requested one.
    if source_sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, source_sr, target_sr)
    return waveform, target_sr
|
|
|
|
| |
| |
| |
# Load both models eagerly at import time so the first request has no
# cold-start penalty. The fast model screens the whole file; the large model
# refines only flagged spans.
print(f"Loading FAST Whisper: {FAST_MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
fast_model = WhisperModel(FAST_MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)

print(f"Loading LARGE Whisper: {MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
large_model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)

print("All models ready!\n")
|
|
|
|
| |
| |
| |
def transcribe(file_path):
    """Transcribe audio with word-level timestamps and explicit-word flags.

    Two-pass strategy: the fast model transcribes the whole file; any word
    matching ``BAD_WORDS`` has its time span re-transcribed by the large
    model for more accurate words and timings.

    Args:
        file_path: Path to the uploaded audio file.

    Returns:
        list[dict]: entries with keys ``word``/``start``/``end``/``explicit``/
        ``explicit_fast``, sorted by start time.
    """
    wav, sr = load_audio_safe(file_path)

    # Persist the normalised (mono, 16 kHz) audio to a per-call temp file
    # instead of a shared "input_fixed.wav" in the CWD, which would be
    # clobbered by concurrent requests and was never cleaned up.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        fixed_path = tmp.name

    try:
        torchaudio.save(fixed_path, wav, sr)

        # Pass 1: fast model over the whole file.
        fast_segments, _fast_info = fast_model.transcribe(
            fixed_path,
            beam_size=1,
            word_timestamps=True,
            vad_filter=True,
        )

        transcript = []
        # fast_segments is a generator that reads the file lazily, so it must
        # be fully consumed before the temp file is removed.
        for seg in fast_segments:
            if not getattr(seg, "words", None):
                continue
            for w in seg.words:
                # Match against the cleaned (lowercase, word-chars-only) form.
                clean_word = re.sub(r"[^\w]", "", w.word.strip().lower())
                is_explicit = clean_word in BAD_WORDS
                transcript.append({
                    "word": w.word.strip(),
                    "start": float(w.start),
                    "end": float(w.end),
                    "explicit": is_explicit,
                    "explicit_fast": is_explicit,
                })
    finally:
        # The temp copy is only needed for the fast pass.
        if os.path.exists(fixed_path):
            os.remove(fixed_path)

    if not any(w["explicit_fast"] for w in transcript):
        print("No explicit words detected - returning fast transcript.")
        return transcript

    # Pass 2: refine only the flagged word spans with the large model.
    final = []
    for entry in transcript:
        if not entry["explicit_fast"]:
            final.append(entry)
            continue
        final.extend(_refine_entry(entry, wav, sr))

    final.sort(key=lambda x: x["start"])
    return final


def _refine_entry(entry, wav, sample_rate):
    """Re-transcribe one flagged word span with the large model.

    Args:
        entry: Fast-pass word dict being refined.
        wav: Full waveform tensor of shape ``(1, num_samples)``.
        sample_rate: Rate ``wav`` was loaded at; the entry's second-based
            timestamps index this same signal, so this is the correct rate
            for slicing (not the model's reported rate).

    Returns:
        list[dict]: refined word entries with timestamps shifted back to
        absolute positions, or ``[entry]`` unchanged when the chunk is empty
        or the large model fails.
    """
    start_s = entry["start"]
    start_sample = int(start_s * sample_rate)
    end_sample = int(entry["end"] * sample_rate)
    chunk = wav[:, start_sample:end_sample]

    if chunk.numel() == 0:
        return [entry]

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        chunk_path = tmp.name
    try:
        torchaudio.save(chunk_path, chunk, sample_rate)
        refined_segs, _ = large_model.transcribe(
            chunk_path,
            beam_size=5,
            word_timestamps=True,
            vad_filter=False,
        )
        # Consume the generator inside the try so the file still exists,
        # shifting chunk-relative timestamps back to absolute positions.
        refined_words = [
            {
                "word": w.word.strip(),
                "start": float(w.start) + start_s,
                "end": float(w.end) + start_s,
                "explicit": entry["explicit_fast"],
                "explicit_fast": entry["explicit_fast"],
            }
            for seg in refined_segs
            if getattr(seg, "words", None)
            for w in seg.words
        ]
    except Exception as e:
        # Best-effort refinement: keep the fast result on any failure.
        print(f"Large model failed on chunk: {e} - keeping fast result")
        return [entry]
    finally:
        # Always remove the chunk file, even when save/transcribe raises
        # (the original leaked it on a save failure).
        if os.path.exists(chunk_path):
            os.remove(chunk_path)

    return refined_words or [entry]
|
|
|
|
| |
| |
| |
# Gradio UI: single audio upload in, JSON word list out.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Vocals"),
    outputs=gr.JSON(label="Transcript with Explicit Flags"),
    title="CleanSong AI – Whisper Transcriber",
    description=(
        "Fast model detects explicit words – "
        "Large model refines only those segments. "
        "Returns word-level timestamps."
    ),
)

# Launch the web UI only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()
|
|