init

app.py CHANGED
@@ -47,6 +47,18 @@ checkpoint_path = "checkpoints/Puffin-Base.pth"
 checkpoint = torch.load(checkpoint_path)
 info = model.load_state_dict(checkpoint, strict=False)
 
+def extract_up_lat_figs(fig_dict):
+    fig_up, fig_lat = None, None
+    others = {}
+    for k, fig in fig_dict.items():
+        if ("up_field" in k) and (fig_up is None):
+            fig_up = fig
+        elif ("latitude_field" in k) and (fig_lat is None):
+            fig_lat = fig
+        else:
+            others[k] = fig
+    return fig_up, fig_lat, others
+
 
 @torch.inference_mode()
 @spaces.GPU(duration=120)
@@ -88,15 +100,11 @@ def camera_understanding(image_src, question, seed, progress=gr.Progress(track_t
     single_batch["latitude_field"] = cam[2:].unsqueeze(0)
 
     figs = make_perspective_figures(single_batch, single_batch, n_pairs=1)
-
-
-
-        img = np.array(fig.canvas.renderer.buffer_rgba())
-        imgs.append(img)
-        plt.close(fig)
-    merged_imgs = np.concatenate(imgs, axis=1)
+
+
+    fig_up, fig_lat, _ = extract_up_lat_figs(figs)
 
-    return text,
+    return text, fig_up, fig_lat
 
 
 @torch.inference_mode()
@@ -192,7 +200,8 @@ with gr.Blocks(css=css) as demo:
     understanding_button = gr.Button("Chat")
     understanding_output = gr.Textbox(label="Response")
 
-
+    camera1 = gr.Gallery(label="Camera Maps", columns=1, rows=1)
+    camera2 = gr.Gallery(label="Camera Maps", columns=1, rows=1)
 
     with gr.Accordion("Advanced options", open=False):
         und_seed_input = gr.Number(label="Seed", precision=0, value=42)
@@ -215,7 +224,7 @@
     understanding_button.click(
         camera_understanding,
         inputs=[image_input, und_seed_input],
-        outputs=[understanding_output,
+        outputs=[understanding_output, camera1, camera2]
    )
 
 demo.launch(share=True)
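The new extract_up_lat_figs helper walks the dict of Matplotlib figures returned by make_perspective_figures and picks out the first up-field and latitude-field panels by substring match on the key, so the handler can return text, fig_up, fig_lat instead of rasterizing and concatenating every figure as the removed code did. A minimal standalone check of that selection logic; the dict keys below are hypothetical stand-ins, since the real keys produced by make_perspective_figures are not shown in this diff:

import matplotlib
matplotlib.use("Agg")  # headless backend, as on a Spaces server
import matplotlib.pyplot as plt

def extract_up_lat_figs(fig_dict):
    # selection logic as added in this commit: first match wins per field
    fig_up, fig_lat = None, None
    others = {}
    for k, fig in fig_dict.items():
        if ("up_field" in k) and (fig_up is None):
            fig_up = fig
        elif ("latitude_field" in k) and (fig_lat is None):
            fig_lat = fig
        else:
            others[k] = fig
    return fig_up, fig_lat, others

# hypothetical key names, for illustration only
figs = {
    "pair0_up_field": plt.figure(),
    "pair0_latitude_field": plt.figure(),
    "pair0_image": plt.figure(),
}
fig_up, fig_lat, others = extract_up_lat_figs(figs)
assert fig_up is figs["pair0_up_field"]
assert fig_lat is figs["pair0_latitude_field"]
assert list(others) == ["pair0_image"]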
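One caveat worth flagging: gr.Gallery renders a list of image-like values (file paths, numpy arrays, PIL images), while the handler now returns bare Matplotlib Figure objects. If the figures do not display in the galleries, a fallback is to rasterize each figure first, reusing the buffer_rgba idiom from the code this commit removes. A sketch under that assumption; fig_to_array is a hypothetical helper, not part of the commit:

import numpy as np
import matplotlib.pyplot as plt

def fig_to_array(fig):
    # draw first so the Agg renderer and its buffer exist
    fig.canvas.draw()
    # same RGBA-buffer idiom as the removed merging code
    img = np.array(fig.canvas.renderer.buffer_rgba())
    plt.close(fig)  # free the figure once rasterized
    return img

# the handler would then end with something like:
#     return text, [fig_to_array(fig_up)], [fig_to_array(fig_lat)]
# (one single-image list per gallery)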