Update app.py
app.py CHANGED
@@ -125,7 +125,7 @@ def handle_feedback(feedback):
     conn.commit()
     return "Thank you for your feedback!", None
 
-def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
+def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_audio_path="speech_segment.wav"):
     """
     Uses Demucs to separate audio and extract background (non-vocal) parts.
     Merges drums, bass, and other stems into a single background track.
@@ -142,6 +142,7 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
     stem_dir = os.path.join("separated", "htdemucs", filename)
 
     # Step 3: Load and merge background stems
+    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
     drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
     bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
     other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
@@ -150,7 +151,8 @@ def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
 
     # Step 4: Export the merged background
     background.export(background_audio_path, format="wav")
-
+    vocals.export(speech_audio_path, format="wav")
+    return background_audio_path, speech_audio_path
 
 # def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
 #     pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
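Stitched together, the hunks above make the updated helper read roughly as follows. The parts that fall between the hunks (the Demucs invocation in Steps 1 and 2, the derivation of filename, and the overlay that builds background) are not visible in this diff and are reconstructed here as assumptions, not copied from app.py:

import os
import subprocess
from pydub import AudioSegment

def segment_background_audio(audio_path, background_audio_path="background_segments.wav", speech_audio_path="speech_segment.wav"):
    # Step 1: Run Demucs source separation (assumed CLI invocation; default htdemucs model).
    subprocess.run(["demucs", audio_path], check=True)

    # Step 2: Demucs writes stems under separated/htdemucs/<track name>/ (filename derivation assumed).
    filename = os.path.splitext(os.path.basename(audio_path))[0]
    stem_dir = os.path.join("separated", "htdemucs", filename)

    # Step 3: Load and merge background stems; vocals become the speech track.
    vocals = AudioSegment.from_wav(os.path.join(stem_dir, "vocals.wav"))
    drums = AudioSegment.from_wav(os.path.join(stem_dir, "drums.wav"))
    bass = AudioSegment.from_wav(os.path.join(stem_dir, "bass.wav"))
    other = AudioSegment.from_wav(os.path.join(stem_dir, "other.wav"))
    background = drums.overlay(bass).overlay(other)  # assumed merge; not shown in the hunks

    # Step 4: Export the merged background and the isolated vocals.
    background.export(background_audio_path, format="wav")
    vocals.export(speech_audio_path, format="wav")
    return background_audio_path, speech_audio_path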
@@ -186,7 +188,7 @@ def transcribe_video_with_speakers(video_path):
     video.audio.write_audiofile(audio_path)
     logger.info(f"Audio extracted from video: {audio_path}")
 
-    segment_result = segment_background_audio(audio_path)
+    segment_result, speech_audio_path = segment_background_audio(audio_path)
     print(f"Saved non-speech (background) audio to local")
 
     # Set up device
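For reference, the two values returned by the updated helper map as follows; only the vocals-only path is consumed by the transcription steps further down in this diff:

# Usage sketch of the new return values (names as in the diff).
segment_result, speech_audio_path = segment_background_audio(audio_path)
# segment_result     -> "background_segments.wav" (merged drums/bass/other)
# speech_audio_path  -> "speech_segment.wav" (isolated vocals, fed to WhisperX below)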
@@ -199,7 +201,7 @@ def transcribe_video_with_speakers(video_path):
     logger.info("WhisperX model loaded")
 
     # Transcribe
-    result = model.transcribe(
+    result = model.transcribe(speech_audio_path, chunk_size=6, print_progress = True)
     logger.info("Audio transcription completed")
 
     # Get the detected language
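The transcription step now runs on the separated vocals rather than the raw extracted audio. A minimal sketch of this step in isolation, assuming a standard WhisperX setup above the hunk (model size and compute type are assumptions, not taken from app.py):

import torch
import whisperx

device = "cuda" if torch.cuda.is_available() else "cpu"
# Model choice is an assumption; the actual load call sits outside this hunk.
model = whisperx.load_model("large-v2", device, compute_type="float16" if device == "cuda" else "int8")
result = model.transcribe(speech_audio_path, chunk_size=6, print_progress=True)
detected_language = result["language"]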
@@ -207,12 +209,12 @@ def transcribe_video_with_speakers(video_path):
     logger.debug(f"Detected language: {detected_language}")
     # Alignment
     model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata,
+    result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)
     logger.info("Transcription alignment completed")
 
     # Diarization (works independently of Whisper model size)
     diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
-    diarize_segments = diarize_model(
+    diarize_segments = diarize_model(speech_audio_path)
     logger.info("Speaker diarization completed")
 
     # Assign speakers
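Alignment and diarization both consume the same vocals-only file. A sketch of the surrounding calls, with the speaker-assignment step (which the hunk only hints at) filled in via whisperx.assign_word_speakers as an assumption:

model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, speech_audio_path, device)

diarize_model = whisperx.DiarizationPipeline(use_auth_token=hf_api_key, device=device)
diarize_segments = diarize_model(speech_audio_path)

# Assumed follow-up: attach speaker labels to the aligned segments.
result = whisperx.assign_word_speakers(diarize_segments, result)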
@@ -243,7 +245,7 @@ def transcribe_video_with_speakers(video_path):
 
     # Collapse and truncate speaker audio
     speaker_sample_paths = {}
-    audio_clip = AudioFileClip(
+    audio_clip = AudioFileClip(speech_audio_path)
     for speaker, segments in speaker_audio.items():
         speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
         combined_clip = concatenate_audioclips(speaker_clips)
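The per-speaker sample collection now subclips the vocals-only track. A sketch of how the loop might finish; the export path and the truncation length are assumptions, since the diff cuts off after the concatenation:

from moviepy.editor import AudioFileClip, concatenate_audioclips

audio_clip = AudioFileClip(speech_audio_path)
for speaker, segments in speaker_audio.items():
    speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
    combined_clip = concatenate_audioclips(speaker_clips)
    # Assumed: cap each speaker sample at 30 s and write it out for later use.
    truncated = combined_clip.subclip(0, min(30, combined_clip.duration))
    sample_path = f"speaker_{speaker}_sample.wav"  # hypothetical file name
    truncated.write_audiofile(sample_path)
    speaker_sample_paths[speaker] = sample_path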
@@ -256,7 +258,7 @@ def transcribe_video_with_speakers(video_path):
     # Clean up
     video.close()
     audio_clip.close()
-    os.remove(
+    os.remove(speech_audio_path)
 
     return transcript_with_speakers, detected_language
 