# CleanSong — app.py (commit 73c22fe, verified)
import re
import os
import tempfile
import gradio as gr
import torch
import torchaudio
import requests
from faster_whisper import WhisperModel
# ================================
# CONFIG
# ================================
# Prefer GPU when available; everything below degrades gracefully to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Accurate model: used only to re-transcribe flagged (explicit) segments.
MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
# Cheap model: used for the first full-track detection pass.
FAST_MODEL_NAME = os.getenv("FAST_WHISPER_MODEL", "base")
# float16 requires CUDA; int8 keeps CPU inference small and fast.
COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"
# LDNOOBW community profanity list (English), one term per line.
BAD_WORD_URL = (
"https://raw.githubusercontent.com/LDNOOBW/"
"List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
)
# ================================
# BAD WORD LIST
# ================================
def get_bad_words():
    """Fetch the LDNOOBW English profanity list and return it as a set.

    Every term is lower-cased and stripped of non-word characters so it
    matches transcript tokens normalized the same way in the fast pass.
    A small built-in fallback set is returned when the download fails for
    any reason, so the app always has a usable filter.

    Returns:
        set[str]: normalized bad words.
    """
    fallback = {"fuck", "shit", "bitch", "ass", "damn", "hell"}
    try:
        print("🌐 Fetching bad-word list…")
        r = requests.get(BAD_WORD_URL, timeout=10)
        # FIX: a non-200 response previously fell through the `if` and the
        # function implicitly returned None, which crashed every later
        # `word in BAD_WORDS` membership test.  Raising here routes any
        # HTTP error through the except branch and the fallback set.
        r.raise_for_status()
        words = {
            re.sub(r"[^\w]", "", w.lower())
            for line in r.text.splitlines()
            for w in line.split()
            if w.strip()
        }
        # Extra words to always catch, regardless of the remote list.
        words.update({"hell", "dam", "damn", "yeah"})
        print(f"βœ… Loaded {len(words)} bad words.")
        return words
    except Exception as e:
        print(f"⚠️ Failed to fetch list: {e}")
        return fallback
# Fetched once at import time; read-only set shared by every request.
BAD_WORDS = get_bad_words()
# ================================
# UTILITY: SAFE AUDIO LOAD
# ================================
def load_audio_safe(path, target_sr=16000):
    """Load an audio file as mono at ``target_sr`` Hz.

    Multi-channel input is averaged down to one channel, and the waveform
    is resampled whenever the file's native rate differs from the target.

    Returns:
        tuple: (waveform tensor of shape (1, samples), target_sr)
    """
    waveform, native_sr = torchaudio.load(path)
    # Collapse stereo/multi-channel audio to mono by channel averaging.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Resample only when needed; Whisper expects 16 kHz input.
    if native_sr == target_sr:
        return waveform, target_sr
    resampled = torchaudio.functional.resample(waveform, native_sr, target_sr)
    return resampled, target_sr
# ================================
# LOAD MODELS
# ================================
# Both models are instantiated once at import time so every Gradio request
# reuses the same loaded weights.
print(f"πŸš€ Loading FAST Whisper: {FAST_MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
# Small model: quick full-track pass that flags candidate explicit words.
fast_model = WhisperModel(FAST_MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
print(f"πŸš€ Loading LARGE Whisper: {MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
# Large model: accurate re-transcription of only the flagged chunks.
large_model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
print("βœ… All models ready!\n")
# ================================
# MAIN TRANSCRIBE FUNCTION
# ================================
def transcribe(file_path):
    """Transcribe an audio file and flag explicit words.

    Two-stage pipeline:
      1. The fast Whisper model transcribes the whole track with word-level
         timestamps; each word is checked against ``BAD_WORDS``.
      2. Only the flagged words are re-transcribed by the large model; all
         other words are kept from the fast pass untouched.

    Args:
        file_path: path to the uploaded audio file.

    Returns:
        list[dict]: word entries sorted by start time, each with keys
        ``word``, ``start``, ``end``, ``explicit``, ``explicit_fast``.
    """
    # Load + normalize audio (mono, 16 kHz).
    wav, sr = load_audio_safe(file_path)

    # FIX: write the normalized track to a unique temp file instead of a
    # hard-coded "input_fixed.wav" in the CWD, which concurrent Gradio
    # requests would clobber — and which was never deleted.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        fixed_path = tmp.name
    try:
        torchaudio.save(fixed_path, wav, sr)
        transcript, sample_rate = _fast_pass(fixed_path, sr)
    finally:
        if os.path.exists(fixed_path):
            os.remove(fixed_path)

    # =====================================
    # EARLY EXIT IF NO EXPLICIT WORDS
    # =====================================
    if not any(w["explicit_fast"] for w in transcript):
        print("βœ… No explicit words detected β€” returning fast transcript.")
        return transcript

    # =====================================
    # 2) REFINE PASS β€” only explicit words
    # =====================================
    final = []
    for entry in transcript:
        if not entry["explicit_fast"]:
            # Not explicit — keep untouched.
            final.append(entry)
            continue
        refined = _refine_entry(entry, wav, sample_rate)
        if refined:
            final.extend(refined)
        else:
            # Collapsed chunk, model failure, or empty result: keep fast word.
            final.append(entry)

    # Sort by timestamp (critical for the downstream assembler).
    final.sort(key=lambda x: x["start"])
    return final


def _fast_pass(audio_path, default_sr):
    """Fast-model pass over the whole track; returns (transcript, sample_rate)."""
    fast_segments, fast_info = fast_model.transcribe(
        audio_path,
        beam_size=1,
        word_timestamps=True,
        vad_filter=True,
    )
    # NOTE(review): TranscriptionInfo does not normally expose sample_rate,
    # so this falls back to the rate the audio was saved at — TODO confirm.
    sample_rate = getattr(fast_info, "sample_rate", default_sr)
    transcript = []
    for seg in fast_segments:
        if not getattr(seg, "words", None):
            continue
        for w in seg.words:
            # Normalize the token exactly like the bad-word list was built.
            clean_word = re.sub(r"[^\w]", "", w.word.strip().lower())
            is_explicit = clean_word in BAD_WORDS
            transcript.append({
                "word": w.word.strip(),
                "start": float(w.start),
                "end": float(w.end),
                "explicit": is_explicit,
                "explicit_fast": is_explicit,
            })
    return transcript, sample_rate


def _refine_entry(entry, wav, sample_rate):
    """Re-transcribe one flagged word's audio chunk with the large model.

    Returns a list of refined word dicts with timestamps offset back to
    full-track time, or None when the chunk is empty, the model fails, or
    it returns nothing — the caller then keeps the fast-pass entry.
    """
    start_s = entry["start"]
    end_s = entry["end"]
    chunk = wav[:, int(start_s * sample_rate):int(end_s * sample_rate)]
    # Safety: collapsed timestamp produces an empty slice.
    if chunk.numel() == 0:
        return None

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        chunk_path = tmp.name
    # FIX: cleanup now lives in a finally block; the original duplicated
    # os.remove() on two paths and leaked the file if save() raised.
    try:
        torchaudio.save(chunk_path, chunk, sample_rate)
        refined_segs, _ = large_model.transcribe(
            chunk_path,
            beam_size=5,
            word_timestamps=True,
            vad_filter=False,
        )
        refined_words = []
        for seg in refined_segs:
            if not getattr(seg, "words", None):
                continue
            for w in seg.words:
                refined_words.append({
                    "word": w.word.strip(),
                    # Offset chunk-local timestamps back to full-track time.
                    "start": float(w.start) + start_s,
                    "end": float(w.end) + start_s,
                    "explicit": entry["explicit_fast"],
                    "explicit_fast": entry["explicit_fast"],
                })
        return refined_words or None
    except Exception as e:
        print(f"⚠️ Large model failed on chunk: {e} β€” keeping fast result")
        return None
    finally:
        os.remove(chunk_path)
# ================================
# GRADIO UI
# ================================
# Single-page app: upload vocals, get back the word-level JSON transcript.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Vocals"),
    outputs=gr.JSON(label="Transcript with Explicit Flags"),
    title="CleanSong AI β€” Whisper Transcriber",
    description=(
        "Fast model detects explicit words β†’ "
        "Large model refines only those segments. "
        "Returns word-level timestamps."
    ),
)

# Start the web server only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()