CleanSong committed on
Commit
e96d8eb
·
verified ·
1 Parent(s): c53f9a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -16
app.py CHANGED
@@ -1,20 +1,46 @@
1
  import gradio as gr
2
  import torch
3
  import torchaudio
4
- import os, json
5
  from faster_whisper import WhisperModel
6
 
7
- # === Load model once ===
8
- device = "cuda" if torch.cuda.is_available() else "cpu"
9
  MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
10
  COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"
11
-
12
- model = WhisperModel(
13
- MODEL_NAME,
14
- device=device,
15
- compute_type=COMPUTE_TYPE, # float16 on GPU → identical timestamp precision to OpenAI
16
  )
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def transcribe(file_path):
19
  # --- Ensure proper audio format ---
20
  wav, sr = torchaudio.load(file_path)
@@ -26,37 +52,47 @@ def transcribe(file_path):
26
  torchaudio.save(fixed_path, wav, 16000)
27
 
28
  # --- Transcribe ---
 
29
  segments, info = model.transcribe(
30
  fixed_path,
31
  beam_size=5,
32
  word_timestamps=True,
33
- vad_filter=True, # helps prevent drift in pauses
34
- suppress_silence=True
35
  )
36
 
37
  # --- Build transcript list ---
38
  transcript = []
39
  for seg in segments:
40
  for w in seg.words:
 
41
  transcript.append({
42
- "word": w.word.strip(),
43
  "start": w.start,
44
- "end": w.end
 
45
  })
46
 
47
  if not transcript:
48
- transcript = [{"text": seg.text, "start": seg.start, "end": seg.end} for seg in segments]
 
 
 
 
 
49
 
50
- print(f"✅ Transcribed {len(transcript)} words")
 
51
  return transcript
52
 
53
 
54
  iface = gr.Interface(
55
  fn=transcribe,
56
  inputs=gr.Audio(type="filepath", label="Upload Vocals"),
57
- outputs=gr.JSON(label="Transcript"),
58
  title="CleanSong AI — Whisper Transcriber (Faster-Whisper Large-V3)",
59
- description="High-accuracy transcription with precise per-word timestamps at 16 kHz mono (float16)."
 
60
  )
61
 
62
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import torch
3
  import torchaudio
4
+ import os, json, requests
5
  from faster_whisper import WhisperModel
6
 
7
+ # === CONFIG ===
8
+ DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
9
  MODEL_NAME = os.getenv("WHISPER_MODEL", "large-v3")
10
  COMPUTE_TYPE = "float16" if torch.cuda.is_available() else "int8"
11
+ BAD_WORD_URL = (
12
+ "https://raw.githubusercontent.com/LDNOOBW/"
13
+ "List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
 
 
14
  )
15
 
16
+ # === LOAD PROFANITY LIST ===
17
+ def get_bad_words():
18
+ try:
19
+ print(f"🌐 Fetching bad-word list from GitHub…")
20
+ r = requests.get(BAD_WORD_URL, timeout=10)
21
+ if r.status_code == 200:
22
+ words = set(
23
+ w.strip().lower() for w in r.text.splitlines() if w.strip()
24
+ )
25
+ print(f"✅ Loaded {len(words)} bad words.")
26
+ return words
27
+ except Exception as e:
28
+ print(f"⚠️ Failed to fetch list: {e}")
29
+
30
+ # fallback local list
31
+ fallback = {"fuck", "shit", "bitch", "ass", "nigga", "nigger", "pussy", "cunt"}
32
+ print(f"⚠️ Using fallback list ({len(fallback)} words).")
33
+ return fallback
34
+
35
+
36
+ BAD_WORDS = get_bad_words()
37
+
38
+ # === LOAD MODEL ===
39
+ print(f"🚀 Loading Whisper model: {MODEL_NAME} ({COMPUTE_TYPE}) on {DEVICE}")
40
+ model = WhisperModel(MODEL_NAME, device=DEVICE, compute_type=COMPUTE_TYPE)
41
+ print("✅ Model ready!")
42
+
43
+ # === FUNCTION ===
44
  def transcribe(file_path):
45
  # --- Ensure proper audio format ---
46
  wav, sr = torchaudio.load(file_path)
 
52
  torchaudio.save(fixed_path, wav, 16000)
53
 
54
  # --- Transcribe ---
55
+ print("🎧 Starting transcription…")
56
  segments, info = model.transcribe(
57
  fixed_path,
58
  beam_size=5,
59
  word_timestamps=True,
60
+ vad_filter=True,
61
+ suppress_silence=True,
62
  )
63
 
64
  # --- Build transcript list ---
65
  transcript = []
66
  for seg in segments:
67
  for w in seg.words:
68
+ word = w.word.strip()
69
  transcript.append({
70
+ "word": word,
71
  "start": w.start,
72
+ "end": w.end,
73
+ "explicit": word.lower() in BAD_WORDS
74
  })
75
 
76
  if not transcript:
77
+ transcript = [{
78
+ "text": seg.text,
79
+ "start": seg.start,
80
+ "end": seg.end,
81
+ "explicit": False
82
+ } for seg in segments]
83
 
84
+ print(f"✅ Transcribed {len(transcript)} words "
85
+ f"({sum(1 for w in transcript if w['explicit'])} explicit).")
86
  return transcript
87
 
88
 
89
# Gradio UI: audio file in → JSON transcript (per-word timestamps + explicit
# flags) out. `type="filepath"` hands transcribe() a path on disk.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(type="filepath", label="Upload Vocals"),
    outputs=gr.JSON(label="Transcript with Explicit Flags"),
    title="CleanSong AI — Whisper Transcriber (Faster-Whisper Large-V3)",
    description="Transcribes vocals with per-word timestamps and explicit-word flags "
                "(auto-updated bad-word list)."
)
97
 
98
  if __name__ == "__main__":