Update app.py
Browse files
app.py
CHANGED
|
@@ -33,6 +33,8 @@ import traceback
|
|
| 33 |
from TTS.api import TTS
|
| 34 |
import torch
|
| 35 |
from TTS.tts.configs.xtts_config import XttsConfig
|
|
|
|
|
|
|
| 36 |
|
| 37 |
# Accept license terms for Coqui XTTS
|
| 38 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
|
@@ -128,12 +130,47 @@ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %
|
|
| 128 |
logger = logging.getLogger(__name__)
|
| 129 |
logger.info(f"MoviePy Version: {moviepy.__version__}")
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
def transcribe_video_with_speakers(video_path):
|
| 132 |
# Extract audio from video
|
| 133 |
video = VideoFileClip(video_path)
|
| 134 |
audio_path = "audio.wav"
|
| 135 |
video.audio.write_audiofile(audio_path)
|
| 136 |
logger.info(f"Audio extracted from video: {audio_path}")
|
|
|
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# Set up device
|
| 139 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 33 |
from TTS.api import TTS
|
| 34 |
import torch
|
| 35 |
from TTS.tts.configs.xtts_config import XttsConfig
|
| 36 |
+
from pydub import AudioSegment
|
| 37 |
+
from pyannote.audio import Pipeline
|
| 38 |
|
| 39 |
# Accept license terms for Coqui XTTS
|
| 40 |
os.environ["COQUI_TOS_AGREED"] = "1"
|
|
|
|
| 130 |
logger = logging.getLogger(__name__)
|
| 131 |
logger.info(f"MoviePy Version: {moviepy.__version__}")
|
| 132 |
|
| 133 |
+
def segment_background_audio(audio_path, output_path="background_segments.wav"):
    """Extract the non-speech (background) audio from a WAV file.

    Runs pyannote voice-activity detection on *audio_path*, then mutes every
    detected speech segment so only background sound remains, and exports the
    result to *output_path*.

    Args:
        audio_path: Path to the input WAV file.
        output_path: Path for the exported background-only WAV
            (default: "background_segments.wav").

    Returns:
        The output path that was written.
    """
    # Initialize the pyannote voice-activity-detection pipeline.
    # NOTE(review): HUGGINGFACE_TOKEN is assumed to be defined at module level
    # elsewhere in this file — confirm it is set before this is called.
    pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection",
        use_auth_token=HUGGINGFACE_TOKEN
    )

    # Run VAD to get the speech segments to remove.
    vad_result = pipeline(audio_path)
    print(f"Detected speech segments: {vad_result}")

    # Load the full audio and mute each speech segment in place.
    # (The previous implementation overlaid silence onto silence and then
    # mixed that silence into the full audio — a no-op that exported the
    # original audio unchanged. Splicing silence over the speech span
    # actually removes it.)
    result_audio = AudioSegment.from_wav(audio_path)
    for segment in vad_result.itersegments():
        start_ms = int(segment.start * 1000)
        end_ms = int(segment.end * 1000)
        # Replace the speech span with an equal-length silent span so the
        # overall duration (and alignment with the video) is preserved.
        result_audio = (
            result_audio[:start_ms]
            + AudioSegment.silent(duration=end_ms - start_ms)
            + result_audio[end_ms:]
        )

    # Export the background-only audio.
    result_audio.export(output_path, format="wav")
    print(f"Saved non-speech (background) audio to: {output_path}")

    return output_path
|
| 164 |
+
|
| 165 |
def transcribe_video_with_speakers(video_path):
|
| 166 |
# Extract audio from video
|
| 167 |
video = VideoFileClip(video_path)
|
| 168 |
audio_path = "audio.wav"
|
| 169 |
video.audio.write_audiofile(audio_path)
|
| 170 |
logger.info(f"Audio extracted from video: {audio_path}")
|
| 171 |
+
|
| 172 |
+
segment_background_audio(audio_path)
|
| 173 |
+
print(f"Saved non-speech (background) audio to local")
|
| 174 |
|
| 175 |
# Set up device
|
| 176 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|