Krokodilpirat committed (verified)
Commit d6483ee · 1 Parent(s): c245745

Update app.py

Files changed (1): app.py (+3 −377)
app.py CHANGED
@@ -13,8 +13,6 @@ from video_depth_anything.video_depth import VideoDepthAnything
 from utils.dc_utils import read_video_frames, save_video
 from transformers import BlipProcessor, BlipForConditionalGeneration
 from PIL import Image
-import tempfile
-import shutil
 
 # --- Environment setup ---
 os.environ["HF_HOME"] = "/tmp/huggingface"
@@ -47,55 +45,6 @@ print("Loading BLIP model...")
 blip_processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
 blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")
 
-# --- Load MMaudio model ---
-print("Loading MMaudio model...")
-mmaudio_model = None
-
-try:
-    # Check if mmaudio folder exists (local installation)
-    if os.path.exists('./mmaudio'):
-        print("✅ Found local mmaudio folder")
-
-        # List contents to debug structure
-        import os
-        mmaudio_contents = os.listdir('./mmaudio')
-        print(f"DEBUG: mmaudio folder contents: {mmaudio_contents}")
-
-        # Add mmaudio to Python path
-        if './mmaudio' not in sys.path:
-            sys.path.insert(0, './mmaudio')
-            print("✅ Added mmaudio to Python path")
-
-        # Try different import patterns
-        try:
-            from eval_utils import generate, load_mmaudio_model
-            print("✅ MMaudio eval_utils imported successfully")
-
-            # Load model
-            device = 'cuda' if torch.cuda.is_available() else 'cpu'
-            model_name = "large_44k_v2"
-
-            mmaudio_model = load_mmaudio_model(
-                model_name=model_name,
-                device=device
-            )
-
-            print(f"✅ MMaudio {model_name} loaded on {device}")
-
-        except Exception as load_error:
-            print(f"❌ MMaudio model loading failed: {load_error}")
-            import traceback
-            traceback.print_exc()
-            mmaudio_model = None
-
-    else:
-        print("⚠️ mmaudio folder not found")
-        mmaudio_model = None
-
-except Exception as e:
-    print(f"❌ MMaudio setup failed: {e}")
-    mmaudio_model = None
-
 def get_first_frame_for_blip(video_path, target_size=480):
     """Efficient: loads only the first frame for BLIP (not all frames!)"""
     try:
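Note on the loader deleted above: it re-imported `os` inside the `try` block and wrapped everything in two blanket `except Exception` handlers. For comparison only, a minimal sketch of the same local-checkout probe (hypothetical, not part of this commit):

import importlib.util
import os
import sys

# Hedged sketch: same intent as the removed block - prefer a local
# ./mmaudio checkout - without the mid-function "import os" or the
# nested blanket exception handlers. Names mirror the deleted code.
if os.path.isdir("./mmaudio") and "./mmaudio" not in sys.path:
    sys.path.insert(0, "./mmaudio")

mmaudio_model = None
if importlib.util.find_spec("mmaudio") is None:
    print("⚠️ mmaudio not importable; audio features stay disabled")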
@@ -176,175 +125,7 @@ def generate_blip_name(frame: np.ndarray) -> str:
         print(f"BLIP error: {e}")
         return "video"
 
-# --- 🎵 MMaudio Functions ---
-
-def get_video_duration(video_path):
-    """Get video duration in seconds using OpenCV"""
-    try:
-        cap = cv2.VideoCapture(video_path)
-        fps = cap.get(cv2.CAP_PROP_FPS)
-        frame_count = cap.get(cv2.CAP_PROP_FRAME_COUNT)
-        cap.release()
-
-        if fps > 0:
-            duration = frame_count / fps
-            return duration
-        else:
-            return 8.0  # Default duration
-    except Exception as e:
-        print(f"Duration detection failed: {e}")
-        return 8.0
-
-def generate_audio_for_video(video_path, audio_prompt, negative_prompt="", audio_seed=-1, num_inference_steps=25, guidance_scale=4.5):
-    """Generate audio for video using MMaudio (official HF Space style)"""
-    try:
-        if mmaudio_model is None:
-            raise RuntimeError("MMaudio model not loaded")
-
-        # Import generation utilities (like official space)
-        from mmaudio.eval_utils import generate
-
-        # Get video duration (MMaudio works best with 8s, but can handle longer)
-        duration = get_video_duration(video_path)
-        print(f"DEBUG: Generating audio for {duration}s duration (matching input video)")
-
-        # Prepare inputs like official HF Space
-        text_input = audio_prompt.strip() if audio_prompt.strip() else None
-        negative_text = negative_prompt.strip() if negative_prompt.strip() else None
-
-        # Handle seed (like official space)
-        if audio_seed == -1:
-            import random
-            audio_seed = random.randint(0, 2**32 - 1)
-
-        # Set seeds for reproducibility
-        torch.manual_seed(audio_seed)
-        if torch.cuda.is_available():
-            torch.cuda.manual_seed(audio_seed)
-
-        print(f"DEBUG: MMaudio generating with:")
-        print(f" Prompt: '{text_input}'")
-        print(f" Negative: '{negative_text}'")
-        print(f" Seed: {audio_seed}")
-        print(f" Steps: {num_inference_steps}")
-        print(f" Guidance: {guidance_scale}")
-        print(f" Duration: {duration}s")
-
-        # Generate audio using official MMaudio generate function
-        with torch.no_grad():
-            # This follows the official HF Space pattern
-            result = generate(
-                model=mmaudio_model,
-                video_path=video_path,
-                text=text_input,
-                negative_text=negative_text,
-                duration=duration,
-                guidance_scale=guidance_scale,
-                num_inference_steps=num_inference_steps,
-                seed=audio_seed
-            )
-
-        # Save generated audio to temporary file
-        temp_audio_path = tempfile.mktemp(suffix=".wav")
-
-        # Extract audio from result (format depends on MMaudio output)
-        if isinstance(result, dict) and 'audio' in result:
-            audio_data = result['audio']
-        else:
-            audio_data = result
-
-        # Convert audio to numpy and save
-        if isinstance(audio_data, torch.Tensor):
-            audio_np = audio_data.cpu().numpy()
-            # Normalize audio
-            if audio_np.max() > 0:
-                audio_np = audio_np / np.max(np.abs(audio_np))
-
-        import scipy.io.wavfile
-        sample_rate = 44100  # 44.1kHz
-
-        # Handle different audio shapes
-        if len(audio_np.shape) == 2:
-            # Stereo - take first channel or mix down
-            if audio_np.shape[0] == 2:  # (2, samples)
-                audio_np = audio_np[0]  # Take first channel
-            elif audio_np.shape[1] == 2:  # (samples, 2)
-                audio_np = np.mean(audio_np, axis=1)  # Mix to mono
-
-        # Ensure audio is 1D
-        audio_np = audio_np.flatten()
-
-        # Convert to int16 for WAV
-        audio_int16 = (audio_np * 32767).astype(np.int16)
-        scipy.io.wavfile.write(temp_audio_path, sample_rate, audio_int16)
-
-        print(f"DEBUG: Audio generated and saved to: {temp_audio_path}")
-        return temp_audio_path
-
-    except Exception as e:
-        print(f"❌ Audio generation failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-
-def mix_audio_with_video(video_path, audio_path, volume=0.5, replace_audio=False):
-    """Mix generated audio with video using FFmpeg"""
-    try:
-        # Create output path
-        base_name = os.path.splitext(os.path.basename(video_path))[0]
-        output_path = f"{base_name}_with_audio.mp4"
-
-        if replace_audio:
-            # Replace original audio completely
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-i", audio_path,
-                "-c:v", "copy",
-                "-c:a", "aac",
-                "-map", "0:v:0",
-                "-map", "1:a:0",
-                "-shortest",
-                output_path
-            ]
-        else:
-            # Mix with original audio
-            cmd = [
-                "ffmpeg", "-y",
-                "-i", video_path,
-                "-i", audio_path,
-                "-filter_complex", f"[0:a][1:a]amix=inputs=2:duration=shortest:weights=1 {volume}[a]",
-                "-map", "0:v:0",
-                "-map", "[a]",
-                "-c:v", "copy",
-                "-c:a", "aac",
-                "-shortest",
-                output_path
-            ]
-
-        print(f"DEBUG: Mixing audio with video: {' '.join(cmd)}")
-        result = subprocess.run(cmd, capture_output=True, text=True)
-
-        if result.returncode == 0:
-            print(f"✅ Audio mixed successfully: {output_path}")
-            return output_path
-        else:
-            print(f"❌ FFmpeg mixing failed: {result.stderr}")
-            return None
-
-    except Exception as e:
-        print(f"❌ Audio mixing failed: {e}")
-        return None
-
-def cleanup_temp_files(*file_paths):
-    """Clean up temporary files"""
-    for file_path in file_paths:
-        if file_path and os.path.exists(file_path):
-            try:
-                os.remove(file_path)
-                print(f"DEBUG: Cleaned up: {file_path}")
-            except Exception as e:
-                print(f"DEBUG: Cleanup failed for {file_path}: {e}")
+# --- 🎨 Thumbnail Generation Functions ---
 
 def create_overlay_thumbnail(rgb_frame, depth_frame):
     """
@@ -819,79 +600,10 @@ def download_generic_video(url):
 # --- Global variables for toggling ---
 current_video_file = None
 current_video_url = None
-current_original_video = None  # Store original video before audio processing
 blip_generated_name = ""
 original_filename = ""
 
-# --- Audio Generation Handler ---
-def on_generate_audio(upload_video, video_url, audio_prompt, negative_prompt, audio_volume, replace_audio, audio_seed, num_inference_steps, guidance_scale):
-    """Handle audio generation for input video with full parameter control"""
-    global current_video_file, current_video_url, current_original_video
-
-    try:
-        # Determine input video
-        input_video = upload_video or video_url
-        if not input_video:
-            return None, "❌ No video provided for audio generation"
-
-        if not audio_prompt.strip():
-            return None, "❌ Please provide an audio prompt"
-
-        if mmaudio_model is None:
-            return None, "❌ MMaudio model not available"
-
-        # Store original video if not already stored
-        if current_original_video is None:
-            current_original_video = input_video
-
-        # Use original video for audio generation (not previous audio version)
-        print(f"DEBUG: Generating audio for: {current_original_video}")
-
-        # Generate audio with all parameters
-        audio_path = generate_audio_for_video(
-            current_original_video,
-            audio_prompt,
-            negative_prompt,
-            audio_seed,
-            num_inference_steps,
-            guidance_scale
-        )
-        if not audio_path:
-            return None, "❌ Audio generation failed"
-
-        # Mix audio with video
-        video_with_audio = mix_audio_with_video(current_original_video, audio_path, audio_volume, replace_audio)
-        if not video_with_audio:
-            cleanup_temp_files(audio_path)
-            return None, "❌ Audio mixing failed"
-
-        # Update current video to the new version with audio
-        if upload_video:
-            current_video_file = video_with_audio
-        else:
-            current_video_url = video_with_audio
-
-        # Cleanup temporary audio file
-        cleanup_temp_files(audio_path)
-
-        # Build success message with parameters
-        success_msg = f"✅ Audio generated successfully!"
-        if replace_audio:
-            success_msg += " (Original audio replaced)"
-        else:
-            success_msg += f" (Mixed at {audio_volume*100:.0f}% volume)"
-
-        success_msg += f"<br>🎛️ Steps: {num_inference_steps}, Guidance: {guidance_scale}"
-        if audio_seed != -1:
-            success_msg += f", Seed: {audio_seed}"
-
-        print(f"DEBUG: Audio generation completed: {video_with_audio}")
-        return video_with_audio, success_msg
-
-    except Exception as e:
-        error_msg = f"❌ Audio generation error: {str(e)}"
-        print(error_msg)
-        return None, error_msg
+# --- MAIN INFERENCE FUNCTION - NO FALLBACK THUMBNAIL ---
 def infer_video_depth_from_source(upload_video, video_url, filename, use_blip, create_thumbnail, *args):
     """Process video to generate depth maps and RGBD output - NO FALLBACK THUMBNAIL"""
    try:
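The handler removed above passed `audio_volume` into `mix_audio_with_video`, whose amix filter string (`weights=1 {volume}`) embeds an unquoted space that ffmpeg's filtergraph parser can misread. A sketch of an equivalent mix that sidesteps the quoting issue by scaling the generated track with a `volume` filter before mixing (file names are hypothetical):

import subprocess

volume = 0.5  # gain for the generated track; the removed slider's default
cmd = [
    "ffmpeg", "-y",
    "-i", "input.mp4",       # hypothetical paths, for illustration only
    "-i", "generated.wav",
    # Scale input 1, then mix; avoids a space-containing amix weights value.
    "-filter_complex",
    f"[1:a]volume={volume}[g];[0:a][g]amix=inputs=2:duration=shortest[a]",
    "-map", "0:v:0", "-map", "[a]",
    "-c:v", "copy", "-c:a", "aac",
    "output.mp4",
]
subprocess.run(cmd, check=True)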
@@ -1305,85 +1017,9 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
                 width=180,
                 interactive=False,
                 show_label=True,
-                scale=1,
-                format="jpeg"  # Force JPEG for downloads
+                scale=1
             )
 
-            # MMaudio Integration
-            with gr.Accordion("🎵 Audio Generation (MMaudio)", open=False):
-                with gr.Row():
-                    enable_mmaudio = gr.Checkbox(
-                        label="Generate Audio",
-                        value=False,
-                        info="Generate audio track from video content using MMaudio Large V2",
-                        scale=1
-                    )
-                    audio_prompt = gr.Textbox(
-                        label="Audio Prompt",
-                        placeholder="Describe the desired audio (e.g. 'ocean waves', 'forest sounds', 'city traffic', 'epic cinematic music')",
-                        scale=4,
-                        lines=2
-                    )
-
-                with gr.Row():
-                    negative_prompt = gr.Textbox(
-                        label="Negative Prompt",
-                        placeholder="What to avoid in audio (e.g. 'music', 'voices', 'loud noises')",
-                        scale=3,
-                        lines=1
-                    )
-                    audio_volume = gr.Slider(
-                        label="Audio Volume",
-                        minimum=0.0,
-                        maximum=1.0,
-                        value=0.5,
-                        step=0.1,
-                        info="Mix volume with original audio",
-                        scale=2
-                    )
-                    replace_audio = gr.Checkbox(
-                        label="Replace Original Audio",
-                        value=False,
-                        info="Replace instead of mixing",
-                        scale=1
-                    )
-
-                with gr.Row():
-                    audio_seed = gr.Number(
-                        label="Seed (-1: random)",
-                        value=-1,
-                        precision=0,
-                        info="Seed for reproducible audio generation",
-                        scale=1
-                    )
-                    num_inference_steps = gr.Slider(
-                        label="Num Steps",
-                        minimum=10,
-                        maximum=50,
-                        value=25,
-                        step=1,
-                        info="More steps = better quality, slower generation",
-                        scale=2
-                    )
-                    guidance_scale = gr.Slider(
-                        label="Guidance Strength",
-                        minimum=1.0,
-                        maximum=10.0,
-                        value=4.5,
-                        step=0.5,
-                        info="How closely to follow the prompt",
-                        scale=2
-                    )
-                    generate_audio_btn = gr.Button(
-                        "🎵 Generate Audio",
-                        variant="secondary",
-                        size="sm",
-                        scale=1
-                    )
-
-                audio_duration_info = gr.HTML("ℹ️ Audio duration will automatically match input video length")
-                audio_status = gr.HTML("")
-
     # Event handlers for input changes
     video_url.change(
         fn=on_video_url_change,
@@ -1406,13 +1042,6 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
         outputs=[filename, status_display]
     )
 
-    # Audio generation event
-    generate_audio_btn.click(
-        fn=on_generate_audio,
-        inputs=[upload_video, video_url, audio_prompt, negative_prompt, audio_volume, replace_audio, audio_seed, num_inference_steps, guidance_scale],
-        outputs=[upload_video, audio_status]
-    )
-
     with gr.Accordion("⚙️ Advanced Settings", open=False):
         with gr.Row():
             max_len = gr.Slider(
@@ -1485,9 +1114,6 @@ with gr.Blocks(analytics_enabled=False, title="Video Depth Anything") as demo:
     - **RGBD output**: Side-by-side comparison of original and depth
     - **Thumbnail Preview**: Shows final RGB→Depth gradient after processing
    - **Embedded Thumbnails**: Videos will show previews in Windows Explorer
-    - **Audio Generation**: Use MMaudio Large V2 (44kHz) for high-quality audio synthesis
-    - **Audio Prompts**: Be descriptive (e.g. "gentle ocean waves with seagulls", "epic orchestral music")
-    - **Iterative Audio**: Generate multiple times with different prompts to perfect the audio
     - **Processing time**: Depends on video length and resolution
     - **Filename**: Set your preferred name before clicking Generate!
     """)
 