JasperHaozhe committed on
Commit
b940192
·
verified ·
1 Parent(s): 4a3abf3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +110 -17
app.py CHANGED
@@ -1,25 +1,39 @@
1
  import spaces
2
  import gradio as gr
 
3
  from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
4
  from transformers.image_utils import load_image
5
  from threading import Thread
6
  import torch
 
7
 
8
  from serve_constants import html_header, bibtext, learn_more_markdown, tos_markdown
9
 
10
- MODEL_ID = "JasperHaozhe/RationalRewards-Both-Demo"
 
 
11
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
12
  model = AutoModelForImageTextToText.from_pretrained(
13
  MODEL_ID,
14
  trust_remote_code=True,
15
  torch_dtype=torch.bfloat16
16
- ).to("cuda").eval()
 
 
 
 
 
 
 
 
 
17
 
18
  TASK_CHOICES = [
19
  "Pointwise - Image Editing",
20
  "Pointwise - T2I Generation",
21
  "Pairwise - Image Editing",
22
  "Pairwise - T2I Generation",
 
23
  ]
24
 
25
  # ============================================================
@@ -318,40 +332,107 @@ def update_ui_for_task(task_type):
318
  gr.update(visible=False, label="(unused)", value=None),
319
  gr.update(label="Text-to-Image Prompt", placeholder="Enter the text-to-image generation prompt…"),
320
  )
 
 
 
 
 
 
 
 
 
321
 
322
  @spaces.GPU
323
  def model_inference(task_type, instruction_text, image1, image2, image3):
324
  """Run model inference based on the selected task type and uploaded images."""
 
 
 
 
 
325
  # Validate inputs and collect images based on task
326
  if task_type == "Pointwise - Image Editing":
327
  if not image1 or not image2:
328
- yield "Error: Please upload both Source Image and Edited Image."
329
  return
330
  files = [image1, image2]
 
 
331
  elif task_type == "Pointwise - T2I Generation":
332
  if not image1:
333
- yield "Error: Please upload the Generated Image."
334
  return
335
  files = [image1]
 
 
336
  elif task_type == "Pairwise - Image Editing":
337
  if not image1 or not image2 or not image3:
338
- yield "Error: Please upload Source Image, Image A, and Image B."
339
  return
340
  files = [image1, image2, image3]
 
 
341
  elif task_type == "Pairwise - T2I Generation":
342
  if not image1 or not image2:
343
- yield "Error: Please upload both Image A and Image B."
344
  return
345
  files = [image1, image2]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  else:
347
- yield "Error: Unknown task type selected."
348
  return
349
 
350
- # Load images
351
- loaded_images = [load_image(image) for image in files]
352
-
353
  # Build instruction with <image> placeholders
354
- instruction = create_instruction(instruction_text, task_type)
355
 
356
  # Interleave images into the <image> placeholders
357
  content = []
@@ -363,6 +444,9 @@ def model_inference(task_type, instruction_text, image1, image2, image3):
363
 
364
  messages = [{"role": "user", "content": content}]
365
 
 
 
 
366
  # Generate and stream text
367
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
368
  inputs = processor(
@@ -373,15 +457,19 @@ def model_inference(task_type, instruction_text, image1, image2, image3):
373
  ).to("cuda")
374
 
375
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
376
- generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
377
 
378
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
379
  thread.start()
380
 
381
  buffer = ""
382
- for new_text in streamer:
383
- buffer += new_text
384
- yield buffer
 
 
 
 
385
 
386
  # ============================================================
387
  # Gradio UI
@@ -398,6 +486,9 @@ This demo supports **four evaluation tasks**. Select one to get started:
398
  | **Pointwise – T2I Generation** | Rate a single generated image against a text-to-image prompt. Produces per-aspect scores and a refined prompt. |
399
  | **Pairwise – Image Editing** | Compare two edited images (A vs B) given a source image and editing instruction. Determines which edit is better per aspect. |
400
  | **Pairwise – T2I Generation** | Compare two generated images (A vs B) given a text-to-image prompt. Determines which generation is better per aspect. |
 
 
 
401
  """
402
 
403
  with gr.Blocks(css="""
@@ -408,7 +499,7 @@ with gr.Blocks(css="""
408
  # ---- Overview ----
409
  gr.Markdown(OVERVIEW_MD)
410
 
411
- with gr.Row(equal_height=False):
412
  # ============ LEFT COLUMN – all inputs (scrollable) ============
413
  with gr.Column(scale=1, elem_id="input-panel"):
414
  task_selector = gr.Radio(
@@ -452,6 +543,8 @@ with gr.Blocks(css="""
452
  gr.Examples(
453
  examples=[
454
  ["Pointwise - Image Editing", "Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png", None],
 
 
455
  ],
456
  inputs=[task_selector, instruction, image1, image2, image3],
457
  )
@@ -471,7 +564,7 @@ with gr.Blocks(css="""
471
  submit_btn.click(
472
  fn=model_inference,
473
  inputs=[task_selector, instruction, image1, image2, image3],
474
- outputs=output,
475
  )
476
 
477
  gr.Markdown(tos_markdown)
 
1
  import spaces
2
  import gradio as gr
3
+ from diffusers import FluxKontextPipeline
4
  from transformers import AutoProcessor, AutoModelForImageTextToText, TextIteratorStreamer
5
  from transformers.image_utils import load_image
6
  from threading import Thread
7
  import torch
8
+ import os
9
 
10
  from serve_constants import html_header, bibtext, learn_more_markdown, tos_markdown
11
 
12
+ MODEL_ID = "JasperHaozhe/RationalRewards-Both-Demo"
13
+ FLUX_MODEL_ID = "black-forest-labs/FLUX.1-Kontext-dev"
14
+
15
  processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
16
  model = AutoModelForImageTextToText.from_pretrained(
17
  MODEL_ID,
18
  trust_remote_code=True,
19
  torch_dtype=torch.bfloat16
20
+ ).to("cpu").eval()
21
+
22
+ # Load Flux Pipeline
23
+ flux_pipeline = FluxKontextPipeline.from_pretrained(
24
+ FLUX_MODEL_ID,
25
+ torch_dtype=torch.bfloat16
26
+ )
27
+ # Fix VAE precision for Flux to avoid artifacts
28
+ flux_pipeline.vae.to(dtype=torch.float32)
29
+ flux_pipeline.to("cpu")
30
 
31
  TASK_CHOICES = [
32
  "Pointwise - Image Editing",
33
  "Pointwise - T2I Generation",
34
  "Pairwise - Image Editing",
35
  "Pairwise - T2I Generation",
36
+ "Prompt Tuning - Image Editing",
37
  ]
38
 
39
  # ============================================================
 
332
  gr.update(visible=False, label="(unused)", value=None),
333
  gr.update(label="Text-to-Image Prompt", placeholder="Enter the text-to-image generation prompt…"),
334
  )
335
+ elif task_type == "Prompt Tuning - Image Editing":
336
+ return (
337
+ gr.update(visible=True, label="Source Image"),
338
+ gr.update(visible=True, label="Generated Image", interactive=False, value=None),
339
+ gr.update(visible=False, label="(unused)", value=None),
340
+ gr.update(label="Instruction", placeholder="Enter the instruction for editing..."),
341
+ )
342
+ else:
343
+ raise ValueError(f"Unknown task type: {task_type}")
344
 
345
  @spaces.GPU
346
  def model_inference(task_type, instruction_text, image1, image2, image3):
347
  """Run model inference based on the selected task type and uploaded images."""
348
+
349
+ loaded_images = []
350
+ task_for_template = task_type
351
+ generated_image = None
352
+
353
  # Validate inputs and collect images based on task
354
  if task_type == "Pointwise - Image Editing":
355
  if not image1 or not image2:
356
+ yield "Error: Please upload both Source Image and Edited Image.", None
357
  return
358
  files = [image1, image2]
359
+ loaded_images = [load_image(img) for img in files]
360
+
361
  elif task_type == "Pointwise - T2I Generation":
362
  if not image1:
363
+ yield "Error: Please upload the Generated Image.", None
364
  return
365
  files = [image1]
366
+ loaded_images = [load_image(img) for img in files]
367
+
368
  elif task_type == "Pairwise - Image Editing":
369
  if not image1 or not image2 or not image3:
370
+ yield "Error: Please upload Source Image, Image A, and Image B.", None
371
  return
372
  files = [image1, image2, image3]
373
+ loaded_images = [load_image(img) for img in files]
374
+
375
  elif task_type == "Pairwise - T2I Generation":
376
  if not image1 or not image2:
377
+ yield "Error: Please upload both Image A and Image B.", None
378
  return
379
  files = [image1, image2]
380
+ loaded_images = [load_image(img) for img in files]
381
+
382
+ elif task_type == "Prompt Tuning - Image Editing":
383
+ if not image1:
384
+ yield "Error: Please upload the Source Image.", None
385
+ return
386
+
387
+ yield "Generating edited image with Flux... (This may take a minute)", None
388
+
389
+ # Load source image
390
+ try:
391
+ source_img = load_image(image1)
392
+ width, height = source_img.size
393
+
394
+ # Ensure model is offloaded to CPU to make space for Flux
395
+ model.to("cpu")
396
+ torch.cuda.empty_cache()
397
+
398
+ # Move Flux to CUDA
399
+ flux_pipeline.to("cuda")
400
+
401
+ # Run Flux
402
+ generator = torch.Generator("cuda").manual_seed(42)
403
+ with torch.no_grad():
404
+ generated_image = flux_pipeline(
405
+ prompt=instruction_text,
406
+ image=source_img,
407
+ guidance_scale=3.5,
408
+ num_inference_steps=28,
409
+ width=width,
410
+ height=height,
411
+ generator=generator,
412
+ ).images[0]
413
+
414
+ # Move Flux back to CPU
415
+ flux_pipeline.to("cpu")
416
+ torch.cuda.empty_cache()
417
+
418
+ except Exception as e:
419
+ # Attempt to recover state
420
+ flux_pipeline.to("cpu")
421
+ torch.cuda.empty_cache()
422
+ yield f"Error generating image: {str(e)}", None
423
+ return
424
+
425
+ yield "Image generated! Evaluating...", generated_image
426
+
427
+ loaded_images = [source_img, generated_image]
428
+ task_for_template = "Pointwise - Image Editing"
429
+
430
  else:
431
+ yield "Error: Unknown task type selected.", None
432
  return
433
 
 
 
 
434
  # Build instruction with <image> placeholders
435
+ instruction = create_instruction(instruction_text, task_for_template)
436
 
437
  # Interleave images into the <image> placeholders
438
  content = []
 
444
 
445
  messages = [{"role": "user", "content": content}]
446
 
447
+ # Ensure model is on CUDA for evaluation
448
+ model.to("cuda")
449
+
450
  # Generate and stream text
451
  prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
452
  inputs = processor(
 
457
  ).to("cuda")
458
 
459
  streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
460
+ generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048, temperature=0.3)
461
 
462
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
463
  thread.start()
464
 
465
  buffer = ""
466
+ try:
467
+ for new_text in streamer:
468
+ buffer += new_text
469
+ yield buffer, gr.update()
470
+ finally:
471
+
472
+ pass
473
 
474
  # ============================================================
475
  # Gradio UI
 
486
  | **Pointwise – T2I Generation** | Rate a single generated image against a text-to-image prompt. Produces per-aspect scores and a refined prompt. |
487
  | **Pairwise – Image Editing** | Compare two edited images (A vs B) given a source image and editing instruction. Determines which edit is better per aspect. |
488
  | **Pairwise – T2I Generation** | Compare two generated images (A vs B) given a text-to-image prompt. Determines which generation is better per aspect. |
489
+ | **Prompt Tuning – Image Editing** | Generate an edit using Flux (Kontext) from a source image and instruction, then evaluate it. Use the refinement to tune your prompt. |
490
+
491
+ **Try the examples below - they're basically begging to be clicked! 🎯**
492
  """
493
 
494
  with gr.Blocks(css="""
 
499
  # ---- Overview ----
500
  gr.Markdown(OVERVIEW_MD)
501
 
502
+ with gr.Row(equal_height=True):
503
  # ============ LEFT COLUMN – all inputs (scrollable) ============
504
  with gr.Column(scale=1, elem_id="input-panel"):
505
  task_selector = gr.Radio(
 
543
  gr.Examples(
544
  examples=[
545
  ["Pointwise - Image Editing", "Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png", None],
546
+ ["Pairwise - Image Editing", "Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_ovis_u1_Image A.png"],
547
+ ["Prompt Tuning - Image Editing", "Remove the arrows from the blue sign and add the text of Detour ahead, no right turns.", "example_images/0016cb70b187efe39969766dc4b3f9ed_b63ed6db519f685c33b860b511879cfe2fa7351059a17ebe5eafa83213e222fb_13_source.png", None, None],
548
  ],
549
  inputs=[task_selector, instruction, image1, image2, image3],
550
  )
 
564
  submit_btn.click(
565
  fn=model_inference,
566
  inputs=[task_selector, instruction, image1, image2, image3],
567
+ outputs=[output, image2],
568
  )
569
 
570
  gr.Markdown(tos_markdown)