KangLiao committed on
Commit
789fb80
·
1 Parent(s): 341d585

Re-track binary file with Git LFS

Browse files
Files changed (2) hide show
  1. app.py +20 -9
  2. assets/Puffin.png +3 -0
app.py CHANGED
@@ -63,7 +63,7 @@ info_vae = model.vae.load_state_dict(checkpoint_vae, strict=False)
63
 
64
  description = r"""
65
  <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
66
- 🔥 We make the first attempt to seamlessly integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
67
  🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
68
  """
69
 
@@ -89,6 +89,17 @@ If you have any questions, please feel free to reach me out at <b>kang.liao@ntu.
89
  """
90
 
91
 
 
 
 
 
 
 
 
 
 
 
 
92
  @torch.inference_mode()
93
  @spaces.GPU(duration=120)
94
  # Multimodal Understanding function
@@ -194,7 +205,6 @@ def generate_image(prompt_scene,
194
  cam_map = cam_map / (math.pi / 2)
195
 
196
  prompt = prompt_scene + " " + prompt_camera
197
- print("prompt:", prompt)
198
 
199
  bsz = 4
200
  with torch.no_grad():
@@ -314,14 +324,15 @@ custom_css = """
314
 
315
  with gr.Blocks(css=custom_css) as demo:
316
  #gr.Markdown("# Puffin")
 
317
  gr.Markdown(description)
318
 
319
- with gr.Tab("Camera-controllable Image Generation"):
320
- gr.Markdown(value="## Camera-controllable Image Generation")
321
 
322
- prompt_input = gr.Textbox(label="Scene Prompt")
323
 
324
- with gr.Accordion("Camera Parameters (in radius)", open=True):
325
  with gr.Row():
326
  roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
327
  pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
@@ -330,7 +341,7 @@ with gr.Blocks(css=custom_css) as demo:
330
 
331
  generation_button = gr.Button("Generate Images")
332
 
333
- image_output = gr.Gallery(label="Generated Images", columns=4, rows=1)
334
 
335
  examples_t2i = gr.Examples(
336
  label="Prompt examples",
@@ -352,13 +363,13 @@ with gr.Blocks(css=custom_css) as demo:
352
  understanding_button = gr.Button("Chat")
353
  understanding_output = gr.Textbox(label="Response")
354
 
355
- camera_map = gr.Image(label="Camera Maps")
356
 
357
  with gr.Accordion("Advanced options", open=False):
358
  und_seed_input = gr.Number(label="Seed", precision=0, value=42)
359
 
360
  examples_inpainting = gr.Examples(
361
- label="Camera Understanding examples",
362
  examples=[
363
  "assets/1.jpg",
364
  "assets/2.jpg",
 
63
 
64
  description = r"""
65
  <b>Official Gradio demo</b> for <a href='https://kangliao929.github.io/projects/puffin/' target='_blank'><b>Thinking with Camera: A Unified Multimodal Model for Camera-Centric Understanding and Generation</b></a>.<br>
66
+ 🔥 We make the first attempt to integrate camera geometry into a unified multimodal model, introducing a camera-centric framework (Puffin) to advance multimodal spatial intelligence.<br>
67
  🖼️ Try to switch the tasks and choose different prompts or images to get the generation or understanding results.<br>
68
  """
69
 
 
89
  """
90
 
91
 
92
+ import base64
93
+ with open("assets/Puffin.png", "rb") as f:
94
+ img_bytes = f.read()
95
+ img_b64 = base64.b64encode(img_bytes).decode()
96
+
97
+ html_img = f'''
98
+ <div style="display:flex; justify-content:center; align-items:center; width:100%;">
99
+ <img src="data:image/png;base64,{img_b64}" style="border:none; width:200px; height:auto;"/>
100
+ </div>
101
+ '''
102
+
103
  @torch.inference_mode()
104
  @spaces.GPU(duration=120)
105
  # Multimodal Understanding function
 
205
  cam_map = cam_map / (math.pi / 2)
206
 
207
  prompt = prompt_scene + " " + prompt_camera
 
208
 
209
  bsz = 4
210
  with torch.no_grad():
 
324
 
325
  with gr.Blocks(css=custom_css) as demo:
326
  #gr.Markdown("# Puffin")
327
+ gr.HTML(html_img)
328
  gr.Markdown(description)
329
 
330
+ with gr.Tab("Camera-controllable Generation"):
331
+ gr.Markdown(value="## Camera-controllable Generation")
332
 
333
+ prompt_input = gr.Textbox(label="Scene prompt")
334
 
335
+ with gr.Accordion("Camera parameters (in radius)", open=True):
336
  with gr.Row():
337
  roll = gr.Slider(minimum=-0.7854, maximum=0.7854, value=0.1000, step=0.1000, label="roll value")
338
  pitch = gr.Slider(minimum=-0.7854, maximum=0.7854, value=-0.1000, step=0.1000, label="pitch value")
 
341
 
342
  generation_button = gr.Button("Generate Images")
343
 
344
+ image_output = gr.Gallery(label="Generated images", columns=4, rows=1)
345
 
346
  examples_t2i = gr.Examples(
347
  label="Prompt examples",
 
363
  understanding_button = gr.Button("Chat")
364
  understanding_output = gr.Textbox(label="Response")
365
 
366
+ camera_map = gr.Image(label="Camera maps (up vector and latitude)")
367
 
368
  with gr.Accordion("Advanced options", open=False):
369
  und_seed_input = gr.Number(label="Seed", precision=0, value=42)
370
 
371
  examples_inpainting = gr.Examples(
372
+ label="Examples",
373
  examples=[
374
  "assets/1.jpg",
375
  "assets/2.jpg",
assets/Puffin.png ADDED

Git LFS Details

  • SHA256: 93b0f4acc80b7ab2f928670f0887ff2b8a33fbac1c1970a0e6f941a8f1d43015
  • Pointer size: 132 Bytes
  • Size of remote file: 3.03 MB