Update app.py
app.py CHANGED
@@ -128,32 +128,33 @@ def handle_feedback(feedback):
 
 def segment_background_audio(audio_path, output_path="background_segments.wav"):
     # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
-    pipeline = Pipeline.from_pretrained(
-        "pyannote/voice-activity-detection",
-        use_auth_token=hf_api_key
-    )
-    # Step 3: Run VAD to get speech segments
-    vad_result = pipeline(audio_path)
-    print(f"Detected speech segments: {vad_result}")
-
-    # Step 4: Load full audio and subtract speech segments
-    full_audio = AudioSegment.from_wav(audio_path)
-    background_audio = AudioSegment.silent(duration=len(full_audio))
-
-    for segment in vad_result.itersegments():
-        start_ms = int(segment.start * 1000)
-        end_ms = int(segment.end * 1000)
-        # Remove speech by muting that portion
-        background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
-
-    # Step 5: Subtract background_audio from full_audio
-    result_audio = full_audio.overlay(background_audio)
-
-    # Step 6: Export non-speech segments
-    result_audio.export(output_path, format="wav")
-    print(f"Saved non-speech (background) audio to: {output_path}")
-
     return True
+
+    # pipeline = Pipeline.from_pretrained(
+    #     "pyannote/voice-activity-detection",
+    #     use_auth_token=hf_api_key
+    # )
+    # # Step 3: Run VAD to get speech segments
+    # vad_result = pipeline(audio_path)
+    # print(f"Detected speech segments: {vad_result}")
+
+    # # Step 4: Load full audio and subtract speech segments
+    # full_audio = AudioSegment.from_wav(audio_path)
+    # background_audio = AudioSegment.silent(duration=len(full_audio))
+
+    # for segment in vad_result.itersegments():
+    #     start_ms = int(segment.start * 1000)
+    #     end_ms = int(segment.end * 1000)
+    #     # Remove speech by muting that portion
+    #     background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
+
+    # # Step 5: Subtract background_audio from full_audio
+    # result_audio = full_audio.overlay(background_audio)
+
+    # # Step 6: Export non-speech segments
+    # result_audio.export(output_path, format="wav")
+    # print(f"Saved non-speech (background) audio to: {output_path}")
+
 
 def transcribe_video_with_speakers(video_path):
     # Extract audio from video
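Note for anyone re-enabling the commented-out block: pydub's overlay() mixes two segments together, so overlaying silence onto audio leaves the speech fully audible, and the Step 5 overlay of an all-silent track just returns the original audio unchanged. Below is a minimal sketch of what the disabled logic appears to intend, with the detected speech spans actually silenced by splicing. It is an editor's illustration, not part of the commit: extract_background_audio is a hypothetical name chosen so it does not clash with the committed function, and hf_api_key is assumed to be the module-level Hugging Face token already referenced in app.py.

from pydub import AudioSegment
from pyannote.audio import Pipeline

def extract_background_audio(audio_path, output_path="background_segments.wav"):
    # Voice activity detection: find where speech occurs (assumes pyannote.audio 2.x)
    pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token=hf_api_key,  # assumed module-level token, as in app.py
    )
    vad_result = pipeline(audio_path)

    background_audio = AudioSegment.from_wav(audio_path)
    for segment in vad_result.itersegments():
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)
        # Splice silence over the speech span; matching the frame rate avoids
        # pydub resampling the whole track on concatenation
        silence = AudioSegment.silent(
            duration=end_ms - start_ms,
            frame_rate=background_audio.frame_rate,
        )
        background_audio = background_audio[:start_ms] + silence + background_audio[end_ms:]

    background_audio.export(output_path, format="wav")
    return output_path

Splicing equal-length silence keeps the output exactly as long as the input, so the background track stays time-aligned with the original if it is later mixed back or compared segment by segment.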