Spaces:

vclmax
/

Element-8-Video

Running on Zero

App Files Files Community

Vicente Alvarez committed 3 days ago

Commit

dcbdf35

1 Parent(s): 9c32fea

Add multi-clip generation with audio looping + high res 1024x640: Generate 1-3 clips, loop to match audio duration (CPU work free)

Browse files

Files changed (2) hide show

app.py +209 -85
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -103,7 +103,7 @@ DEFAULT_FRAME_RATE = 24.0
 # Resolution presets: (width, height)
 RESOLUTIONS = {
-    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
     "low": {"16:9": (512, 320), "9:16": (320, 512), "1:1": (512, 512)},
 }
@@ -329,6 +329,73 @@ def apply_gaussian_blur(video_tensor: torch.Tensor, blur_amount: int) -> torch.T
     return blurred
 @spaces.GPU(duration=90)
 @torch.inference_mode()
 def generate_video(
@@ -344,100 +411,152 @@ def generate_video(
     negative_prompt: str = DEFAULT_NEGATIVE_PROMPT,
     blur_amount: int = 0,
     remove_music: bool = False,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
-        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-        frame_rate = DEFAULT_FRAME_RATE
-        num_frames = int(duration * frame_rate) + 1
-        num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
-        print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
-        images = []
-        output_dir = Path("outputs")
-        output_dir.mkdir(exist_ok=True)
-        if first_image is not None:
-            temp_first_path = output_dir / f"temp_first_{current_seed}.jpg"
-            if hasattr(first_image, "save"):
-                first_image.save(temp_first_path)
-            else:
-                temp_first_path = Path(first_image)
-            images.append(ImageConditioningInput(path=str(temp_first_path), frame_idx=0, strength=1.0))
-        if last_image is not None:
-            temp_last_path = output_dir / f"temp_last_{current_seed}.jpg"
-            if hasattr(last_image, "save"):
-                last_image.save(temp_last_path)
-            else:
-                temp_last_path = Path(last_image)
-            images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
-        from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
-        tiling_config = TilingConfig.default()
-        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
-        log_memory("before pipeline call")
-        # Run inference - DistilledPipeline has simpler API
-        video_frames_iter, audio = pipeline(
-            prompt=prompt,
-            seed=current_seed,
-            height=int(height),
-            width=int(width),
-            num_frames=num_frames,
-            frame_rate=frame_rate,
-            images=images,
-            enhance_prompt=enhance_prompt,
-        )
-        # Collect video frames
-        frames = [frame for frame in video_frames_iter]
-        video_tensor = torch.cat(frames, dim=0) if len(frames) > 1 else frames[0]
-        log_memory("after pipeline call")
-        # Apply Gaussian blur if requested (for censoring/teaser effect)
-        if blur_amount > 0:
-            print(f"Applying Gaussian blur (amount={blur_amount})...")
-            video_tensor = apply_gaussian_blur(video_tensor, blur_amount)
-            log_memory("after blur")
-        output_path = tempfile.mktemp(suffix=".mp4")
-        encode_video(
-            video=video_tensor,
-            fps=frame_rate,
-            audio=audio,
-            output_path=output_path,
-            video_chunks_number=video_chunks_number,
-        )
-        log_memory("after encode_video")
-        # Remove background music if requested
-        if remove_music:
-            print(f"Removing background music with Demucs...")
-            processed_path = tempfile.mktemp(suffix=".mp4")
-            success = remove_music_demucs(output_path, processed_path)
-            if success:
-                output_path = processed_path
-                log_memory("after demucs")
-            else:
-                print(f"Warning: Music removal failed, using original video")
-        return str(output_path), current_seed
     except Exception as e:
         import traceback
         log_memory("on error")
         print(f"Error: {str(e)}\n{traceback.format_exc()}")
-        return None, current_seed
 with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # cleanup: check every 1h, delete files >2h old
@@ -462,6 +581,10 @@ with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # c
             )
             duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
             with gr.Accordion("Advanced Settings", open=False):
@@ -530,10 +653,11 @@ with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # c
     )
     generate_btn.click(
-        fn=generate_video,
         inputs=[
             first_image, last_image, prompt, duration, enhance_prompt,
             seed, randomize_seed, height, width, negative_prompt, blur_amount, remove_music,
         ],
         outputs=[output_video, seed],
     )

 # Resolution presets: (width, height)
 RESOLUTIONS = {
+    "high": {"16:9": (1024, 640), "9:16": (640, 1024), "1:1": (1024, 1024)},
     "low": {"16:9": (512, 320), "9:16": (320, 512), "1:1": (512, 512)},
 }
     return blurred
def loop_clips_with_audio_track(clip_paths: list[str], audio_path: str) -> str:
    """Loop the given clips until they cover an audio track, then mux the two.

    The clips are concatenated in order and repeated enough times to span the
    audio. The video stream is then combined with ``audio_path`` as its only
    audio stream and cut to the audio's length. Only pydub and the
    ffprobe/ffmpeg command-line tools are used.

    Args:
        clip_paths: Video files to concatenate, in playback order.
        audio_path: Audio file whose duration the output must match.

    Returns:
        Path of the muxed MP4. This is a best-effort helper: on any failure
        the error is printed and the first entry of ``clip_paths`` is returned
        instead, or ``None`` when ``clip_paths`` is empty.
    """
    import contextlib
    import os
    import subprocess
    import tempfile
    import traceback

    if not clip_paths:
        # Nothing to loop and no clip to fall back to.
        print("[loop] Error: no clips to loop")
        return None

    def _new_temp_path(suffix: str) -> str:
        # mkstemp creates the file atomically; mktemp only returns a name and
        # leaves a window in which another process can claim it.
        handle, path = tempfile.mkstemp(suffix=suffix)
        os.close(handle)
        return path

    scratch_files = []  # intermediates removed in the finally block
    final_video = None
    try:
        # Imported inside the try so a missing pydub falls back to the first
        # clip like every other failure in this function.
        from pydub import AudioSegment

        # Get audio duration
        audio = AudioSegment.from_file(audio_path)
        audio_duration = len(audio) / 1000.0  # Convert to seconds

        # Get total clips duration
        clips_duration = 0.0
        for clip in clip_paths:
            probe = subprocess.run([
                'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1', clip
            ], capture_output=True, text=True, check=True)
            clips_duration += float(probe.stdout.strip())
        if clips_duration <= 0:
            # Fail with a clear message instead of a ZeroDivisionError below.
            raise ValueError("clips have no measurable duration")

        # One repetition more than the integer quotient, so the looped video
        # is never shorter than the audio; the surplus is trimmed by -t below.
        loop_count = int(audio_duration / clips_duration) + 1
        print(f"[loop] Audio: {audio_duration:.2f}s, Clips: {clips_duration:.2f}s, Loops: {loop_count}")

        # Create concat file with loops. Paths are made absolute because the
        # concat demuxer resolves relative entries against the list file's
        # directory, and single quotes are escaped as the demuxer requires.
        entries = []
        for clip in clip_paths:
            quoted = os.path.abspath(clip).replace("'", "'\\''")
            entries.append(f"file '{quoted}'\n")
        concat_file = _new_temp_path(".txt")
        scratch_files.append(concat_file)
        with open(concat_file, 'w', encoding="utf-8") as f:
            f.writelines(entries * loop_count)

        # Concat videos (stream copy, no re-encode)
        concat_video = _new_temp_path(".mp4")
        scratch_files.append(concat_video)
        result = subprocess.run([
            'ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', concat_file,
            '-c', 'copy', concat_video
        ], capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Concat failed: {result.stderr[-200:]}")

        # Replace audio and trim to audio duration
        final_video = _new_temp_path(".mp4")
        result = subprocess.run([
            'ffmpeg', '-y',
            '-i', concat_video,
            '-i', audio_path,
            '-map', '0:v:0', '-map', '1:a:0',
            '-c:v', 'copy', '-c:a', 'aac', '-b:a', '192k',
            '-t', str(audio_duration),
            '-shortest',
            final_video
        ], capture_output=True, text=True)
        if result.returncode != 0:
            raise RuntimeError(f"Audio merge failed: {result.stderr[-200:]}")

        print(f"[loop] Created looped video: {audio_duration:.2f}s")
        return final_video
    except Exception as e:
        print(f"[loop] Error: {e}")
        traceback.print_exc()
        if final_video is not None:
            # Do not leave a partial or empty output file behind.
            scratch_files.append(final_video)
        return clip_paths[0]
    finally:
        for path in scratch_files:
            with contextlib.suppress(OSError):
                os.remove(path)
 @spaces.GPU(duration=90)
 @torch.inference_mode()
 def generate_video(
     negative_prompt: str = DEFAULT_NEGATIVE_PROMPT,
     blur_amount: int = 0,
     remove_music: bool = False,
+    num_clips: int = 1,
     progress=gr.Progress(track_tqdm=True),
 ):
     try:
         torch.cuda.reset_peak_memory_stats()
         log_memory("start")
+        base_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
+        generated_clips = []
+        # Generate multiple clips in one GPU session
+        for clip_idx in range(num_clips):
+            current_seed = base_seed + clip_idx
+            print(f"[GPU] Generating clip {clip_idx + 1}/{num_clips}, seed={current_seed}")
+            frame_rate = DEFAULT_FRAME_RATE
+            num_frames = int(duration * frame_rate) + 1
+            num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
+            print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
+            images = []
+            output_dir = Path("outputs")
+            output_dir.mkdir(exist_ok=True)
+            if first_image is not None:
+                temp_first_path = output_dir / f"temp_first_{current_seed}.jpg"
+                if hasattr(first_image, "save"):
+                    first_image.save(temp_first_path)
+                else:
+                    temp_first_path = Path(first_image)
+                images.append(ImageConditioningInput(path=str(temp_first_path), frame_idx=0, strength=1.0))
+            if last_image is not None:
+                temp_last_path = output_dir / f"temp_last_{current_seed}.jpg"
+                if hasattr(last_image, "save"):
+                    last_image.save(temp_last_path)
+                else:
+                    temp_last_path = Path(last_image)
+                images.append(ImageConditioningInput(path=str(temp_last_path), frame_idx=num_frames - 1, strength=1.0))
+            from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
+            tiling_config = TilingConfig.default()
+            video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
+            log_memory("before pipeline call")
+            # Run inference - DistilledPipeline has simpler API
+            video_frames_iter, audio = pipeline(
+                prompt=prompt,
+                seed=current_seed,
+                height=int(height),
+                width=int(width),
+                num_frames=num_frames,
+                frame_rate=frame_rate,
+                images=images,
+                enhance_prompt=enhance_prompt,
+            )
+            # Collect video frames
+            frames = [frame for frame in video_frames_iter]
+            video_tensor = torch.cat(frames, dim=0) if len(frames) > 1 else frames[0]
+            log_memory("after pipeline call")
+            # Apply Gaussian blur if requested (for censoring/teaser effect)
+            if blur_amount > 0:
+                print(f"Applying Gaussian blur (amount={blur_amount})...")
+                video_tensor = apply_gaussian_blur(video_tensor, blur_amount)
+                log_memory("after blur")
+            output_path = tempfile.mktemp(suffix=".mp4")
+            encode_video(
+                video=video_tensor,
+                fps=frame_rate,
+                audio=audio,
+                output_path=output_path,
+                video_chunks_number=video_chunks_number,
+            )
+            log_memory("after encode_video")
+            # Remove background music if requested
+            if remove_music:
+                print(f"Removing background music with Demucs...")
+                processed_path = tempfile.mktemp(suffix=".mp4")
+                success = remove_music_demucs(output_path, processed_path)
+                if success:
+                    output_path = processed_path
+                    log_memory("after demucs")
+                else:
+                    print(f"Warning: Music removal failed, using original video")
+            generated_clips.append(str(output_path))
+        # Return all generated clips
+        return generated_clips, base_seed
     except Exception as e:
         import traceback
         log_memory("on error")
         print(f"Error: {str(e)}\n{traceback.format_exc()}")
+        return [], base_seed
def full_generation_process(
    first_image,
    last_image,
    prompt: str,
    duration: float,
    enhance_prompt: bool,
    seed: int,
    randomize_seed: bool,
    height: int,
    width: int,
    negative_prompt: str,
    blur_amount: int,
    remove_music: bool,
    num_clips: int,
    audio_track,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate the requested clips, then optionally loop them over an audio track.

    All arguments except ``audio_track`` are forwarded positionally, in order,
    to ``generate_video``.

    Returns:
        A ``(video_path, seed)`` pair. ``video_path`` is ``None`` when no clip
        was produced, the output of ``loop_clips_with_audio_track`` when an
        audio track is given and more than one clip was requested, and the
        first generated clip otherwise.
    """
    # Phase 1: clip generation, delegated to generate_video.
    generation_args = (
        first_image, last_image, prompt, duration, enhance_prompt,
        seed, randomize_seed, height, width, negative_prompt,
        blur_amount, remove_music, num_clips, progress,
    )
    clips, final_seed = generate_video(*generation_args)

    if not clips:
        return None, final_seed

    # Phase 2: looping is only attempted for multi-clip runs that come with audio.
    # NOTE(review): an uploaded audio track is ignored when only one clip is
    # requested -- confirm that this is intended.
    wants_audio_loop = bool(audio_track) and num_clips > 1
    if not wants_audio_loop:
        # Single clip, or several clips without audio: only the first clip is
        # returned (could be a gallery in future).
        return clips[0], final_seed

    print("[CPU] Looping clips to match audio duration...")
    looped_video = loop_clips_with_audio_track(clips, audio_track)
    return looped_video, final_seed
 with gr.Blocks(title="Element-8 Video", delete_cache=(3600, 7200)) as demo:  # cleanup: check every 1h, delete files >2h old
             )
             duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
+            with gr.Row():
+                num_clips = gr.Slider(label="Number of Clips", info="Generate multiple variations", minimum=1, maximum=3, value=1, step=1)
+                audio_track = gr.Audio(label="Audio Track (Optional)", type="filepath", sources=["upload"])
             generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
             with gr.Accordion("Advanced Settings", open=False):
     )
     generate_btn.click(
+        fn=full_generation_process,
         inputs=[
             first_image, last_image, prompt, duration, enhance_prompt,
             seed, randomize_seed, height, width, negative_prompt, blur_amount, remove_music,
+            num_clips, audio_track,
         ],
         outputs=[output_video, seed],
     )

requirements.txt CHANGED Viewed

@@ -11,4 +11,5 @@ scikit-image>=0.25.2
 flashpack==0.1.2
 torchaudio==2.8.0
 demucs
-soundfile

 flashpack==0.1.2
 torchaudio==2.8.0
 demucs
+soundfile
+pydub