Update app.py
Browse files
app.py
CHANGED
|
@@ -126,55 +126,31 @@ def handle_feedback(feedback):
|
|
| 126 |
return "Thank you for your feedback!", None
|
| 127 |
|
| 128 |
def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
|
| 129 |
-
|
| 130 |
-
"""
|
| 131 |
-
Detects and extracts non-speech (background) segments from audio using pyannote VAD.
|
| 132 |
-
|
| 133 |
-
Parameters:
|
| 134 |
-
- audio_path (str): Path to input audio (.wav).
|
| 135 |
-
- segment_audio_path (str): Path to save the output non-speech audio.
|
| 136 |
-
- hf_token (str): Hugging Face auth token for pyannote.
|
| 137 |
-
|
| 138 |
-
Returns:
|
| 139 |
-
- List of non-speech timestamp tuples (start, end) in seconds.
|
| 140 |
-
"""
|
| 141 |
-
|
| 142 |
-
# Step 1: Load pipeline
|
| 143 |
pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_api_key)
|
| 144 |
-
|
| 145 |
-
# Step 2: Apply VAD to get speech segments
|
| 146 |
vad_result = pipeline(audio_path)
|
| 147 |
-
print("✅ Speech segments detected.")
|
| 148 |
|
| 149 |
-
# Step 3: Get full duration of the audio
|
| 150 |
full_audio = AudioSegment.from_wav(audio_path)
|
| 151 |
full_duration_sec = len(full_audio) / 1000.0
|
| 152 |
|
| 153 |
-
# Step 4: Compute non-speech segments
|
| 154 |
-
background_segments = []
|
| 155 |
current_time = 0.0
|
|
|
|
| 156 |
|
| 157 |
for segment in vad_result.itersegments():
|
|
|
|
| 158 |
if current_time < segment.start:
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
current_time = segment.end
|
| 161 |
|
|
|
|
| 162 |
if current_time < full_duration_sec:
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
print(f"🕒 Non-speech segments: {background_segments}")
|
| 166 |
-
|
| 167 |
-
# Step 5: Extract and combine non-speech segments
|
| 168 |
-
non_speech_audio = AudioSegment.empty()
|
| 169 |
-
for start, end in background_segments:
|
| 170 |
-
segment = full_audio[int(start * 1000):int(end * 1000)]
|
| 171 |
-
non_speech_audio += segment
|
| 172 |
-
|
| 173 |
-
# Step 6: Export the non-speech audio
|
| 174 |
-
non_speech_audio.export(background_audio_path, format="wav")
|
| 175 |
-
print(f"🎵 Non-speech audio saved to: {background_audio_path}")
|
| 176 |
|
| 177 |
-
|
|
|
|
| 178 |
|
| 179 |
def transcribe_video_with_speakers(video_path):
|
| 180 |
# Extract audio from video
|
|
@@ -427,9 +403,11 @@ def process_entry(entry, i, tts_model, video_width, video_height, process_mode,
|
|
| 427 |
speaker = entry.get("speaker", "default")
|
| 428 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 429 |
|
| 430 |
-
|
|
|
|
|
|
|
|
|
|
| 431 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
| 432 |
-
|
| 433 |
else:
|
| 434 |
generate_voiceover_OpenAI(entry['translated'], target_language, desired_speed, segment_audio_path)
|
| 435 |
|
|
@@ -504,9 +482,9 @@ def add_transcript_voiceover(video_path, translated_json, output_path, process_m
|
|
| 504 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 505 |
|
| 506 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
final_audio = voice_audio
|
| 510 |
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 511 |
else:
|
| 512 |
final_audio = voice_audio
|
|
|
|
| 126 |
return "Thank you for your feedback!", None
|
| 127 |
|
| 128 |
def segment_background_audio(audio_path, background_audio_path="background_segments.wav"):
    """
    Build a background-only track from the input audio using pyannote VAD.

    Speech regions detected by the VAD pipeline are replaced with silence of
    equal duration while non-speech (background) regions are kept, so the
    output preserves the original timeline length and alignment.

    Parameters:
    - audio_path (str): Path to the input audio file (.wav).
    - background_audio_path (str): Path where the background-only audio is written.

    Returns:
    - str: The path the background audio was exported to (background_audio_path).

    NOTE(review): relies on a module-level `hf_api_key` for pyannote
    authentication — confirm it is defined before this function is called.
    """
    # Load the voice-activity-detection pipeline (may hit the network/auth
    # on first use).
    pipeline = Pipeline.from_pretrained(
        "pyannote/voice-activity-detection", use_auth_token=hf_api_key
    )

    # Detect speech segments in the input audio.
    vad_result = pipeline(audio_path)

    # Full audio; pydub measures length in milliseconds.
    full_audio = AudioSegment.from_wav(audio_path)
    full_duration_sec = len(full_audio) / 1000.0

    current_time = 0.0
    result_audio = AudioSegment.empty()

    for segment in vad_result.itersegments():
        # Keep the background audio that precedes this speech segment.
        if current_time < segment.start:
            result_audio += full_audio[int(current_time * 1000):int(segment.start * 1000)]
        # Replace the speech itself with silence of the same duration so the
        # timeline stays aligned with the original.
        silence_ms = int((segment.end - segment.start) * 1000)
        result_audio += AudioSegment.silent(duration=silence_ms)
        # max() guards against out-of-order or overlapping segments so the
        # cursor never moves backwards (which would duplicate audio in the
        # tail slice below).
        current_time = max(current_time, segment.end)

    # Keep any background remaining after the final speech segment.
    if current_time < full_duration_sec:
        result_audio += full_audio[int(current_time * 1000):]

    result_audio.export(background_audio_path, format="wav")
    return background_audio_path
|
| 154 |
|
| 155 |
def transcribe_video_with_speakers(video_path):
|
| 156 |
# Extract audio from video
|
|
|
|
| 403 |
speaker = entry.get("speaker", "default")
|
| 404 |
speaker_wav_path = f"speaker_{speaker}_sample.wav"
|
| 405 |
|
| 406 |
+
# Assume this is the list of supported languages for the TTS model
|
| 407 |
+
supported_languages = tts_model.synthesizer.tts_model.language_manager.name_to_id.keys()
|
| 408 |
+
|
| 409 |
+
if process_mode > 2 and speaker_wav_path and os.path.exists(speaker_wav_path) and target_language in supported_languages:
|
| 410 |
generate_voiceover_clone(entry['translated'], tts_model, desired_speed, target_language, speaker_wav_path, segment_audio_path)
|
|
|
|
| 411 |
else:
|
| 412 |
generate_voiceover_OpenAI(entry['translated'], target_language, desired_speed, segment_audio_path)
|
| 413 |
|
|
|
|
| 482 |
voice_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
|
| 483 |
|
| 484 |
if background_audio_path and os.path.exists(background_audio_path):
|
| 485 |
+
background_audio = AudioFileClip(background_audio_path).set_duration(video.duration)
|
| 486 |
+
final_audio = CompositeAudioClip([voice_audio, background_audio])
|
| 487 |
+
# final_audio = voice_audio
|
| 488 |
logger.info("✅ Background audio loaded and merged with voiceover.")
|
| 489 |
else:
|
| 490 |
final_audio = voice_audio
|