Spaces:

Curify
/

studio_V1

Sleeping

App Files Community

qqwjq1981 committed on Apr 2, 2025

Commit

28c6cdd

verified ·

1 Parent(s): 2081360

Update app.py

Browse files

Files changed (1) hide show

app.py +48 -28

app.py CHANGED Viewed

@@ -126,34 +126,59 @@ def handle_feedback(feedback):
             conn.commit()
         return "Thank you for your feedback!", None
-def segment_background_audio(audio_path, output_path="background_segments.wav"):
-    # Step 2: Initialize pyannote voice activity detection pipeline (you need Hugging Face token)
-    return True
-    # pipeline = Pipeline.from_pretrained(
-    #     "pyannote/voice-activity-detection",
-    #     use_auth_token=hf_api_key
-    # )
-    # # Step 3: Run VAD to get speech segments
     # vad_result = pipeline(audio_path)
-    # print(f"Detected speech segments: {vad_result}")
-    # # Step 4: Load full audio and subtract speech segments
     # full_audio = AudioSegment.from_wav(audio_path)
-    # background_audio = AudioSegment.silent(duration=len(full_audio))
     # for segment in vad_result.itersegments():
-    #     start_ms = int(segment.start * 1000)
-    #     end_ms = int(segment.end * 1000)
-    #     # Remove speech by muting that portion
-    #     background_audio = background_audio.overlay(AudioSegment.silent(duration=end_ms - start_ms), position=start_ms)
-    # # Step 5: Subtract background_audio from full_audio
-    # result_audio = full_audio.overlay(background_audio)
-    # # Step 6: Export non-speech segments
-    # result_audio.export(output_path, format="wav")
-    # print(f"Saved non-speech (background) audio to: {output_path}")
 def transcribe_video_with_speakers(video_path):
@@ -475,14 +500,9 @@ def add_transcript_voiceover(video_path, translated_json, output_path, add_voice
     final_video = CompositeVideoClip([video] + text_clips)
-    if add_voiceover:
-        if audio_segments:
-            final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
-            final_video = final_video.set_audio(final_audio)
-        else:
-            logger.warning("⚠️ No audio segments available. Adding silent fallback.")
-            silent_audio = AudioClip(lambda t: 0, duration=video.duration)
-            final_video = final_video.set_audio(silent_audio)
     logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

             conn.commit()
         return "Thank you for your feedback!", None
+def segment_background_audio(audio_path, output_path="background_segments.wav", hf_token=None):
+    return 10
+    # """
+    # Detects and extracts non-speech (background) segments from audio using pyannote VAD.
+    # Parameters:
+    # - audio_path (str): Path to input audio (.wav).
+    # - output_path (str): Path to save the output non-speech audio.
+    # - hf_token (str): Hugging Face auth token for pyannote.
+    # Returns:
+    # - List of non-speech timestamp tuples (start, end) in seconds.
+    # """
+    # if not hf_token:
+    #     raise ValueError("Hugging Face token is required for pyannote pipeline.")
+    # # Step 1: Load pipeline
+    # pipeline = Pipeline.from_pretrained("pyannote/voice-activity-detection", use_auth_token=hf_token)
+    # # Step 2: Apply VAD to get speech segments
     # vad_result = pipeline(audio_path)
+    # print("✅ Speech segments detected.")
+    # # Step 3: Get full duration of the audio
     # full_audio = AudioSegment.from_wav(audio_path)
+    # full_duration_sec = len(full_audio) / 1000.0
+    # # Step 4: Compute non-speech segments
+    # background_segments = []
+    # current_time = 0.0
     # for segment in vad_result.itersegments():
+    #     if current_time < segment.start:
+    #         background_segments.append((current_time, segment.start))
+    #     current_time = segment.end
+    # if current_time < full_duration_sec:
+    #     background_segments.append((current_time, full_duration_sec))
+    # print(f"🕒 Non-speech segments: {background_segments}")
+    # # Step 5: Extract and combine non-speech segments
+    # non_speech_audio = AudioSegment.empty()
+    # for start, end in background_segments:
+    #     segment = full_audio[int(start * 1000):int(end * 1000)]
+    #     non_speech_audio += segment
+    # # Step 6: Export the non-speech audio
+    # non_speech_audio.export(output_path, format="wav")
+    # print(f"🎵 Non-speech audio saved to: {output_path}")
+    # return background_segments
 def transcribe_video_with_speakers(video_path):
     final_video = CompositeVideoClip([video] + text_clips)
+    if add_voiceover and audio_segments:
+        final_audio = CompositeAudioClip(audio_segments).set_duration(video.duration)
+        final_video = final_video.set_audio(final_audio)
     logger.info(f"Saving the final video to: {output_path}")
     final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")