Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,8 +34,11 @@ from TTS.api import TTS
|
|
| 34 |
import torch
|
| 35 |
from pydub import AudioSegment
|
| 36 |
from pyannote.audio import Pipeline
|
| 37 |
-
import traceback
|
| 38 |
import wave
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
logger = logging.getLogger(__name__)
|
| 41 |
|
|
@@ -154,32 +157,12 @@ def segment_background_audio(audio_path, background_audio_path="background_segme
|
|
| 154 |
vocals.export(speech_audio_path, format="wav")
|
| 155 |
return background_audio_path, speech_audio_path
|
| 156 |
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
# current_time = 0.0
|
| 165 |
-
# result_audio = AudioSegment.empty()
|
| 166 |
-
|
| 167 |
-
# for segment in vad_result.itersegments():
|
| 168 |
-
# # Background segment before the speech
|
| 169 |
-
# if current_time < segment.start:
|
| 170 |
-
# bg = full_audio[int(current_time * 1000):int(segment.start * 1000)]
|
| 171 |
-
# result_audio += bg
|
| 172 |
-
# # Add silence for the speech duration
|
| 173 |
-
# silence_duration = segment.end - segment.start
|
| 174 |
-
# result_audio += AudioSegment.silent(duration=int(silence_duration * 1000))
|
| 175 |
-
# current_time = segment.end
|
| 176 |
-
|
| 177 |
-
# # Handle any remaining background after the last speech
|
| 178 |
-
# if current_time < full_duration_sec:
|
| 179 |
-
# result_audio += full_audio[int(current_time * 1000):]
|
| 180 |
-
|
| 181 |
-
# result_audio.export(background_audio_path, format="wav")
|
| 182 |
-
# return background_audio_path
|
| 183 |
|
| 184 |
def transcribe_video_with_speakers(video_path):
|
| 185 |
# Extract audio from video
|
|
@@ -250,8 +233,22 @@ def transcribe_video_with_speakers(video_path):
|
|
| 250 |
speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
|
| 251 |
combined_clip = concatenate_audioclips(speaker_clips)
|
| 252 |
truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
|
| 253 |
-
|
| 254 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
speaker_sample_paths[speaker] = sample_path
|
| 256 |
logger.info(f"Created sample for {speaker}: {sample_path}")
|
| 257 |
|
|
|
|
| 34 |
import torch
|
| 35 |
from pydub import AudioSegment
|
| 36 |
from pyannote.audio import Pipeline
|
|
|
|
| 37 |
import wave
|
| 38 |
+
import librosa
|
| 39 |
+
import noisereduce as nr
|
| 40 |
+
import soundfile as sf
|
| 41 |
+
|
| 42 |
|
| 43 |
logger = logging.getLogger(__name__)
|
| 44 |
|
|
|
|
| 157 |
vocals.export(speech_audio_path, format="wav")
|
| 158 |
return background_audio_path, speech_audio_path
|
| 159 |
|
| 160 |
+
def denoise_audio_array(audio_array, sr=16000):
    """Reduce noise in a raw audio signal.

    Args:
        audio_array: numpy array of audio samples.
        sr: sample rate of the signal in Hz (default 16000).

    Returns:
        The denoised samples, as produced by ``noisereduce``.
    """
    return nr.reduce_noise(y=audio_array, sr=sr)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
|
| 167 |
def transcribe_video_with_speakers(video_path):
|
| 168 |
# Extract audio from video
|
|
|
|
| 233 |
speaker_clips = [audio_clip.subclip(start, end) for start, end in segments]
|
| 234 |
combined_clip = concatenate_audioclips(speaker_clips)
|
| 235 |
truncated_clip = combined_clip.subclip(0, min(30, combined_clip.duration))
|
| 236 |
+
|
| 237 |
+
# Step 1: pull the raw samples out of the truncated clip at the target rate.
fps = 16000  # target sampling rate fed to the denoiser
audio_array = truncated_clip.to_soundarray(fps=fps)

# If stereo, collapse to mono by averaging the channels
# (to_soundarray yields shape (n, 2) for stereo clips).
if audio_array.ndim == 2:
    audio_array = np.mean(audio_array, axis=1)

# Step 2: Apply denoising
denoised_audio_array = denoise_audio_array(audio_array, sr=fps)

# Step 3: Save denoised audio directly
clean_sample_path = f"speaker_{speaker}_sample.wav"
sf.write(clean_sample_path, denoised_audio_array, fps)

# BUG FIX: record the denoised file, not the stale `sample_path` —
# otherwise the denoising above has no effect on downstream consumers.
speaker_sample_paths[speaker] = clean_sample_path
logger.info(f"Created sample for {speaker}: {clean_sample_path}")
|
| 254 |
|