Spaces:

LimaRaed
/

DS_TextToVideo

Sleeping

App Files Files Community

LimaRaed committed on May 4, 2025

Commit

2f7fdab

verified ·

1 Parent(s): 96c9fdb

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -31

app.py CHANGED Viewed

@@ -9,9 +9,8 @@ warnings.filterwarnings("ignore")
 # Set to use CPU
 torch_device = "cpu"
-torch_dtype = torch.float32  # Use float32 for CPU stability
-# Load a lightweight model
 def load_model():
     model_id = "damo-vilab/text-to-video-ms-1.7b"
     pipe = DiffusionPipeline.from_pretrained(
@@ -19,77 +18,73 @@ def load_model():
         torch_dtype=torch_dtype
     )
     pipe = pipe.to(torch_device)
-    pipe.enable_attention_slicing()  # Reduce memory usage
     return pipe
 def generate_video(prompt, num_frames=8, num_inference_steps=20):
     start_time = time.time()
-    # Load model with caching
     if not hasattr(generate_video, "pipe"):
         generate_video.pipe = load_model()
-    # Generate with lower resolution and fewer frames for CPU
     with torch.no_grad():
         output = generate_video.pipe(
             prompt,
-            num_frames=min(num_frames, 8),  # Keep frames low for CPU
             num_inference_steps=min(num_inference_steps, 20),
-            height=256,  # Lower resolution
             width=256
         )
-    # Convert numpy arrays to PIL Images
-    frames = [Image.fromarray((frame * 255).astype(np.uint8)) for frame in output.frames]
     # Create GIF
     gif_path = "output.gif"
-    duration = max(1000 // 3, 100)  # Minimum 100ms per frame
     frames[0].save(
         gif_path,
         save_all=True,
         append_images=frames[1:],
-        duration=duration,
         loop=0,
-        save_format='GIF'
     )
-    gen_time = time.time() - start_time
-    print(f"Generation took {gen_time:.2f} seconds")
     return gif_path
 # Gradio Interface
 with gr.Blocks(title="CPU Text-to-Video") as demo:
     gr.Markdown("# 🐢 CPU Text-to-Video Generator")
-    gr.Markdown("This version runs entirely on CPU - generations will be slower and lower quality")
     with gr.Row():
         with gr.Column():
-            prompt = gr.Textbox(label="Prompt", placeholder="A fish swimming in space")
             with gr.Accordion("Advanced Options", open=False):
                 frames = gr.Slider(4, 12, value=8, step=4, label="Frames")
                 steps = gr.Slider(10, 30, value=20, step=5, label="Steps")
-            submit = gr.Button("Generate", variant="primary")
         with gr.Column():
             output = gr.Image(label="Result", format="gif")
-            gr.Markdown("Note: On CPU, generation may take 5-15 minutes")
-    examples = gr.Examples(
-        examples=[
-            ["A paper boat floating on water"],
-            ["A sloth wearing sunglasses"],
-            ["A candle flame in the wind"]
-        ],
-        inputs=prompt,
-        label="Try these examples"
-    )
     submit.click(
         fn=generate_video,
         inputs=[prompt, frames, steps],
-        outputs=output,
-        api_name="generate"
     )
-demo.launch(show_api=False)

 # Set to use CPU
 torch_device = "cpu"
+torch_dtype = torch.float32
 def load_model():
     model_id = "damo-vilab/text-to-video-ms-1.7b"
     pipe = DiffusionPipeline.from_pretrained(
         torch_dtype=torch_dtype
     )
     pipe = pipe.to(torch_device)
+    pipe.enable_attention_slicing()
     return pipe
 def generate_video(prompt, num_frames=8, num_inference_steps=20):
     start_time = time.time()
     if not hasattr(generate_video, "pipe"):
         generate_video.pipe = load_model()
     with torch.no_grad():
         output = generate_video.pipe(
             prompt,
+            num_frames=min(num_frames, 8),
             num_inference_steps=min(num_inference_steps, 20),
+            height=256,
             width=256
         )
+    # Correct frame conversion - handle the 4D array properly
+    video_frames = output.frames
+    if isinstance(video_frames, np.ndarray):
+        # Reshape from (1, num_frames, height, width, 3) to (num_frames, height, width, 3)
+        if video_frames.ndim == 5:
+            video_frames = video_frames[0]  # Remove batch dimension
+        frames = []
+        for frame in video_frames:
+            # Convert to 8-bit and ensure correct channel order
+            frame = (frame * 255).astype(np.uint8)
+            frames.append(Image.fromarray(frame))
+    else:
+        raise ValueError("Unexpected frame format")
     # Create GIF
     gif_path = "output.gif"
     frames[0].save(
         gif_path,
         save_all=True,
         append_images=frames[1:],
+        duration=100,  # 100ms per frame
         loop=0,
+        quality=80
     )
+    print(f"Generation took {time.time() - start_time:.2f} seconds")
     return gif_path
 # Gradio Interface
 with gr.Blocks(title="CPU Text-to-Video") as demo:
     gr.Markdown("# 🐢 CPU Text-to-Video Generator")
     with gr.Row():
         with gr.Column():
+            prompt = gr.Textbox(label="Prompt")
             with gr.Accordion("Advanced Options", open=False):
                 frames = gr.Slider(4, 12, value=8, step=4, label="Frames")
                 steps = gr.Slider(10, 30, value=20, step=5, label="Steps")
+            submit = gr.Button("Generate")
         with gr.Column():
             output = gr.Image(label="Result", format="gif")
+            gr.Markdown("Note: CPU generation may take several minutes")
     submit.click(
         fn=generate_video,
         inputs=[prompt, frames, steps],
+        outputs=output
     )
+demo.launch()