x10z committed (verified)
Commit 2ab4d40 · Parent(s): 91ef6d2

Update app.py

Files changed (1)
  1. app.py +29 -19
app.py CHANGED
@@ -13,15 +13,15 @@ from GeoWizard.geowizard.models.unet_2d_condition import UNet2DConditionModel
 from GeoWizard.geowizard.models.geowizard_pipeline import DepthNormalEstimationPipeline
 
 # Device setup
-device = 'cuda' if torch.cuda.is_available() else 'cpu'
-checkpoint_path = "GonzaloMG/geowizard-e2e-ft"
+DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
+CHECKPOINT_PATH = "GonzaloMG/geowizard-e2e-ft"
 
 # Load pretrained components
-vae = AutoencoderKL.from_pretrained(checkpoint_path, subfolder='vae')
-scheduler = DDIMScheduler.from_pretrained(checkpoint_path, timestep_spacing="trailing", subfolder='scheduler')
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(checkpoint_path, subfolder="image_encoder")
-feature_extractor = CLIPImageProcessor.from_pretrained(checkpoint_path, subfolder="feature_extractor")
-unet = UNet2DConditionModel.from_pretrained(checkpoint_path, subfolder="unet")
+vae = AutoencoderKL.from_pretrained(CHECKPOINT_PATH, subfolder='vae')
+scheduler = DDIMScheduler.from_pretrained(CHECKPOINT_PATH, timestep_spacing="trailing", subfolder='scheduler')
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(CHECKPOINT_PATH, subfolder="image_encoder")
+feature_extractor = CLIPImageProcessor.from_pretrained(CHECKPOINT_PATH, subfolder="feature_extractor")
+unet = UNet2DConditionModel.from_pretrained(CHECKPOINT_PATH, subfolder="unet")
 
 # Instantiate pipeline
 pipe = DepthNormalEstimationPipeline(
@@ -30,14 +30,17 @@ pipe = DepthNormalEstimationPipeline(
     feature_extractor=feature_extractor,
     unet=unet,
     scheduler=scheduler
-).to(device)
+).to(DEVICE)
 pipe.unet.eval()
 
 # UI texts
 title = "# End-to-End Fine-Tuned GeoWizard Video"
-description = """
-Please refer to our [paper](https://arxiv.org/abs/2409.11355) and [GitHub](https://vision.rwth-aachen.de/diffusion-e2e-ft) for more details.
-"""
+description = (
+    """
+    Please refer to our [paper](https://arxiv.org/abs/2409.11355) and
+    [GitHub](https://vision.rwth-aachen.de/diffusion-e2e-ft) for more details.
+    """
+)
 
 @spaces.GPU
 def predict(image: Image.Image, processing_res_choice: int):
@@ -70,24 +73,24 @@ def on_submit_video(video_path: str, processing_res_choice: int):
     height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
     frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
 
-    # Temporary output files
+    # Create temporary output files
     tmp_depth = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
     tmp_normal = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
     fourcc = cv2.VideoWriter_fourcc(*'mp4v')
     out_depth = cv2.VideoWriter(tmp_depth.name, fourcc, fps, (width, height))
     out_normal = cv2.VideoWriter(tmp_normal.name, fourcc, fps, (width, height))
 
-    # Process frames
+    # Process each frame
     for _ in tqdm(range(frame_count), desc="Processing frames"):
         ret, frame = cap.read()
         if not ret:
             break
 
-        # Convert BGR to RGB and to PIL
+        # Convert frame to PIL image
         rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
         pil_image = Image.fromarray(rgb)
 
-        # Run prediction
+        # Predict depth and normals
         result = predict(pil_image, processing_res_choice)
         depth_colored = result.depth_colored
         normal_colored = result.normal_colored
@@ -107,9 +110,10 @@ def on_submit_video(video_path: str, processing_res_choice: int):
     out_depth.release()
     out_normal.release()
 
-    # Return paths for download
+    # Return video paths for download
     return tmp_depth.name, tmp_normal.name
 
+
 # Build Gradio interface
 with gr.Blocks() as demo:
     gr.Markdown(title)
@@ -117,7 +121,7 @@ with gr.Blocks() as demo:
     gr.Markdown("### Depth and Normals Prediction on Video")
 
     with gr.Row():
-        input_video = gr.Video(
+        input_video = gr.Video(
             label="Input Video",
             elem_id='video-display-input'
         )
@@ -133,8 +137,14 @@ with gr.Blocks() as demo:
     submit = gr.Button(value="Compute Depth and Normals")
 
     with gr.Row():
-        output_depth_video = gr.Video(label="Depth Video", elem_id='download')
-        output_normal_video = gr.Video(label="Normal Video", elem_id='download')
+        output_depth_video = gr.Video(
+            label="Depth Video",
+            elem_id='download'
+        )
+        output_normal_video = gr.Video(
+            label="Normal Video",
+            elem_id='download'
+        )
 
     submit.click(
         fn=on_submit_video,
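
For orientation, a minimal sketch of how the reformatted output components are presumably wired to the handler: the last hunk ends at fn=on_submit_video, so the remaining arguments below are assumptions inferred from the component names and the on_submit_video signature visible in the diff, not the file's actual contents.

    # Hypothetical wiring sketch, not the committed code. Only fn=on_submit_video
    # is visible in the diff; the processing_res_choice component, the inputs/outputs
    # lists, and the launch call are assumptions.
    submit.click(
        fn=on_submit_video,
        inputs=[input_video, processing_res_choice],
        outputs=[output_depth_video, output_normal_video],
    )

# Queue and launch are the usual pattern for a GPU-backed Gradio Space (assumed here).
demo.queue().launch()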