Re-track binary file with Git LFS
Browse files
- app.py +20 -9
- assets/Puffin.png +3 -0
app.py
CHANGED
|
@@ -63,7 +63,7 @@ info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
|
|
| 63 |
|
| 64 |
description = r"""
|
| 65 |
<b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
|
| 66 |
-
🔥 We make the first attempt to
|
| 67 |
🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
|
| 68 |
"""
|
| 69 |
|
|
@@ -89,6 +89,17 @@ If you have any questions, please feel free to reach me out at <b>kang.liao@ntu.
|
|
| 89 |
"""
|
| 90 |
|
| 91 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
@torch.inference_mode()
|
| 93 |
@spaces.GPU(duration=120)
|
| 94 |
# Multimodal Understanding function
|
|
@@ -194,7 +205,6 @@ def generate_image(prompt_scene,
|
|
| 194 |
cam_map = cam_map / (math.pi / 2)
|
| 195 |
|
| 196 |
prompt = prompt_scene + " " + prompt_camera
|
| 197 |
-
print("prompt:", prompt)
|
| 198 |
|
| 199 |
bsz = 4
|
| 200 |
with torch.no_grad():
|
|
@@ -314,14 +324,15 @@ custom_css = """
|
|
| 314 |
|
| 315 |
with gr.Blocks(css=custom_css) as demo:
|
| 316 |
#gr.Markdown("# Puffin")
|
|
|
|
| 317 |
gr.Markdown(description)
|
| 318 |
|
| 319 |
-
with gr.Tab("Camera-controllable
|
| 320 |
-
gr.Markdown(value="## Camera-controllable
|
| 321 |
|
| 322 |
-
prompt_input = gr.Textbox(label="Scene
|
| 323 |
|
| 324 |
-
with gr.Accordion("Camera
|
| 325 |
with gr.Row():
|
| 326 |
roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
|
| 327 |
pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
|
|
@@ -330,7 +341,7 @@ with gr.Blocks(css=custom_css) as demo:
|
|
| 330 |
|
| 331 |
generation_button = gr.Button("Generate Images")
|
| 332 |
|
| 333 |
-
image_output = gr.Gallery(label="Generated
|
| 334 |
|
| 335 |
examples_t2i = gr.Examples(
|
| 336 |
label="Prompt examples",
|
|
@@ -352,13 +363,13 @@ with gr.Blocks(css=custom_css) as demo:
|
|
| 352 |
understanding_button = gr.Button("Chat")
|
| 353 |
understanding_output = gr.Textbox(label="Response")
|
| 354 |
|
| 355 |
-
camera_map = gr.Image(label="Camera
|
| 356 |
|
| 357 |
with gr.Accordion("Advanced options", open=False):
|
| 358 |
und_seed_input = gr.Number(label="Seed", precision=0, value=42)
|
| 359 |
|
| 360 |
examples_inpainting = gr.Examples(
|
| 361 |
-
label="
|
| 362 |
examples=[
|
| 363 |
"assets/1.jpg",
|
| 364 |
"assets/2.jpg",
|
|
|
|
| 63 |
|
| 64 |
description = r"""
|
| 65 |
<b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
|
| 66 |
+
🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
|
| 67 |
🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
|
| 68 |
"""
|
| 69 |
|
|
|
|
| 89 |
"""
|
| 90 |
|
| 91 |
|
| 92 |
+
import base64
|
| 93 |
+
with open("assets/Puffin.png", "rb") as f:
|
| 94 |
+
img_bytes = f.read()
|
| 95 |
+
img_b64 = base64.b64encode(img_bytes).decode()
|
| 96 |
+
|
| 97 |
+
html_img = f'''
|
| 98 |
+
<div style="display:flex; justify-content:center; align-items:center; width:100%;">
|
| 99 |
+
<img src="data:image/png;base64,{img_b64}" style="border:none; width:200px; height:auto;"/>
|
| 100 |
+
</div>
|
| 101 |
+
'''
|
| 102 |
+
|
| 103 |
@torch.inference_mode()
|
| 104 |
@spaces.GPU(duration=120)
|
| 105 |
# Multimodal Understanding function
|
|
|
|
| 205 |
cam_map = cam_map / (math.pi / 2)
|
| 206 |
|
| 207 |
prompt = prompt_scene + " " + prompt_camera
|
|
|
|
| 208 |
|
| 209 |
bsz = 4
|
| 210 |
with torch.no_grad():
|
|
|
|
| 324 |
|
| 325 |
with gr.Blocks(css=custom_css) as demo:
|
| 326 |
#gr.Markdown("# Puffin")
|
| 327 |
+
gr.HTML(html_img)
|
| 328 |
gr.Markdown(description)
|
| 329 |
|
| 330 |
+
with gr.Tab("Camera-controllable Generation"):
|
| 331 |
+
gr.Markdown(value="## Camera-controllable Generation")
|
| 332 |
|
| 333 |
+
prompt_input = gr.Textbox(label="Scene prompt")
|
| 334 |
|
| 335 |
+
with gr.Accordion("Camera parameters (in radians)", open=True):
|
| 336 |
with gr.Row():
|
| 337 |
roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
|
| 338 |
pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
|
|
|
|
| 341 |
|
| 342 |
generation_button = gr.Button("Generate Images")
|
| 343 |
|
| 344 |
+
image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
|
| 345 |
|
| 346 |
examples_t2i = gr.Examples(
|
| 347 |
label="Prompt examples",
|
|
|
|
| 363 |
understanding_button = gr.Button("Chat")
|
| 364 |
understanding_output = gr.Textbox(label="Response")
|
| 365 |
|
| 366 |
+
camera_map = gr.Image(label="Camera maps (up vector and latitude)")
|
| 367 |
|
| 368 |
with gr.Accordion("Advanced options", open=False):
|
| 369 |
und_seed_input = gr.Number(label="Seed", precision=0, value=42)
|
| 370 |
|
| 371 |
examples_inpainting = gr.Examples(
|
| 372 |
+
label="Examples",
|
| 373 |
examples=[
|
| 374 |
"assets/1.jpg",
|
| 375 |
"assets/2.jpg",
|
assets/Puffin.png
ADDED
|
Git LFS Details
|