Spaces:

KangLiao
/

Puffin

Running on Zero

App Files Community

KangLiao committed on Oct 12

Commit

789fb80

1 Parent(s): 341d585

Re-track binary file with Git LFS

Browse files

Files changed (2) hide show

app.py +20 -9
assets/Puffin.png +3 -0

app.py CHANGED Viewed

@@ -63,7 +63,7 @@ info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
-🔥 We make the first attempt to seamlessly integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
 🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
 """
@@ -89,6 +89,17 @@ If you have any questions, please feel free to reach me out at <b>kang.liao@ntu.
 """
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
@@ -194,7 +205,6 @@ def generate_image(prompt_scene,
     cam_map = cam_map / (math.pi / 2)
     prompt = prompt_scene + " " + prompt_camera
-    print("prompt:", prompt)
     bsz = 4
     with torch.no_grad():
@@ -314,14 +324,15 @@ custom_css = """
 with gr.Blocks(css=custom_css) as demo:
     #gr.Markdown("# Puffin")
     gr.Markdown(description)
-    with gr.Tab("Camera-controllable Image Generation"):
-        gr.Markdown(value="## Camera-controllable Image Generation")
-        prompt_input = gr.Textbox(label="Scene Prompt")
-        with gr.Accordion("Camera Parameters (in radius)", open=True):
             with gr.Row():
                 roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
                 pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
@@ -330,7 +341,7 @@ with gr.Blocks(css=custom_css) as demo:
         generation_button = gr.Button("Generate Images")
-        image_output = gr.Gallery(label="Generated Images", columns=4, rows=1)
         examples_t2i = gr.Examples(
             label="Prompt examples",
@@ -352,13 +363,13 @@ with gr.Blocks(css=custom_css) as demo:
         understanding_button = gr.Button("Chat")
         understanding_output = gr.Textbox(label="Response")
-        camera_map = gr.Image(label="Camera Maps")
         with gr.Accordion("Advanced options", open=False):
             und_seed_input = gr.Number(label="Seed", precision=0, value=42)
         examples_inpainting = gr.Examples(
-            label="Camera Understanding examples",
             examples=[
                 "assets/1.jpg",
                 "assets/2.jpg",

 description = r"""
 <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
+🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
 🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
 """
 """
+import base64
+with open("assets/Puffin.png", "rb") as f:
+    img_bytes = f.read()
+img_b64 = base64.b64encode(img_bytes).decode()
+html_img = f'''
+<div style="display:flex; justify-content:center; align-items:center; width:100%;">
+    <img src="data:image/png;base64,{img_b64}" style="border:none; width:200px; height:auto;"/>
+</div>
+'''
 @torch.inference_mode()
 @spaces.GPU(duration=120)
 # Multimodal Understanding function
     cam_map = cam_map / (math.pi / 2)
     prompt = prompt_scene + " " + prompt_camera
     bsz = 4
     with torch.no_grad():
 with gr.Blocks(css=custom_css) as demo:
     #gr.Markdown("# Puffin")
+    gr.HTML(html_img)
     gr.Markdown(description)
+    with gr.Tab("Camera-controllable Generation"):
+        gr.Markdown(value="## Camera-controllable Generation")
+        prompt_input = gr.Textbox(label="Scene prompt")
+        with gr.Accordion("Camera parameters (in radius)", open=True):
             with gr.Row():
                 roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
                 pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
         generation_button = gr.Button("Generate Images")
+        image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
         examples_t2i = gr.Examples(
             label="Prompt examples",
         understanding_button = gr.Button("Chat")
         understanding_output = gr.Textbox(label="Response")
+        camera_map = gr.Image(label="Camera maps (up vector and latitude)")
         with gr.Accordion("Advanced options", open=False):
             und_seed_input = gr.Number(label="Seed", precision=0, value=42)
         examples_inpainting = gr.Examples(
+            label="Examples",
             examples=[
                 "assets/1.jpg",
                 "assets/2.jpg",

assets/Puffin.png ADDED Viewed

Git LFS Details

SHA256: 93b0f4acc80b7ab2f928670f0887ff2b8a33fbac1c1970a0e6f941a8f1d43015
Pointer size: 132 Bytes
Size of remote file: 3.03 MB