gen3-visual

Sleeping

App Files Files Community

sajofu committed on Aug 14, 2025

Commit

a87dbf7

verified ·

1 Parent(s): 6177680

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -83

app.py CHANGED Viewed

@@ -6,122 +6,87 @@ import torch
 from transformers import AutoModel, AutoProcessor
 from transformers import StoppingCriteria, TextIteratorStreamer, StoppingCriteriaList
-# Set the device for computation
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
-# Load the model and processor from Hugging Face
-# trust_remote_code=True is necessary for this model
 model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True).to(device)
 processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
-# Define a custom stopping criteria to stop generation when the model outputs the end-of-text token
 class StopOnTokens(StoppingCriteria):
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        # The stop token ID for <|endoftext|>
-        stop_ids = [151645]
         for stop_id in stop_ids:
-            # Check if the last generated token is a stop token
             if input_ids[0][-1] == stop_id:
                 return True
         return False
 @torch.no_grad()
 def response(message, history, image):
-    """
-    This function generates the model's response. It handles both text-only and multimodal inputs,
-    builds the conversation history, and streams the response back to the UI.
-    """
     stop = StopOnTokens()
-    # 1. Build the conversation history
     messages = [{"role": "system", "content": "You are a helpful assistant."}]
     for user_msg, assistant_msg in history:
         messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
-    # 2. Prepare the prompt and model inputs for the current turn
-    prompt = message
-    model_kwargs = {}
-    # If an image is provided, process it and prepend the <image> token to the prompt
-    if image is not None:
-        prompt = f"<image>{message}"
-        # Process the image using the model's image_processor
-        processed_images = processor.image_processor(image, return_tensors="pt")['pixel_values'].to(device)
-        model_kwargs['images'] = processed_images
-    messages.append({"role": "user", "content": prompt})
-    # 3. Tokenize the conversation using the chat template
-    inputs = processor.tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
-    ).to(device)
-    model_kwargs['input_ids'] = inputs
-    # 4. Create an attention mask if an image is present
-    if image is not None:
-        # The attention mask needs to be manually created to account for the image tokens
-        attention_mask = torch.ones(
-            1, inputs.shape[1] + processor.num_image_latents - 1,
-            dtype=torch.long,
-            device=device
-        )
-        model_kwargs['attention_mask'] = attention_mask
-    # 5. Set up the streamer for text generation
-    streamer = TextIteratorStreamer(
-        processor.tokenizer,
-        timeout=30.,
-        skip_prompt=True,
-        skip_special_tokens=True
     )
     generate_kwargs = dict(
-        **model_kwargs,
         streamer=streamer,
         max_new_tokens=1024,
         stopping_criteria=StoppingCriteriaList([stop])
-    )
-    # Run generation in a separate thread to not block the UI
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-    # 6. Stream the response to the Gradio UI
-    # Append the original user message (without <image> token) to the history for display
     history.append([message, ""])
     partial_response = ""
     for new_token in streamer:
-        # The model might output this token string instead of the ID
-        if new_token == '<|endoftext|>':
-            break
         partial_response += new_token
         history[-1][1] = partial_response
-        # Yield updates to the chatbot and buttons
-        yield history, gr.update(visible=False), gr.update(visible=True, interactive=True)
-with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
-    gr.Markdown("# UForm-Gen2 DPO Chat Demo")
     with gr.Row():
-        image = gr.Image(type="pil", label="Upload Image (Optional)")
         with gr.Column():
-            chat = gr.Chatbot(label="Conversation", show_label=False, elem_id="chatbot")
-            message = gr.Textbox(
-                interactive=True,
-                show_label=False,
-                placeholder="Type your message or ask about the image...",
-                container=False
-            )
             with gr.Row():
-                gr.ClearButton([chat, message, image], value="🗑️ New Chat")
-                stop = gr.Button("⏹️ Stop", variant="stop", visible=False)
-                submit = gr.Button("▶️ Submit", variant="primary")
     with gr.Row():
         gr.Examples(
@@ -131,7 +96,7 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
                 ["images/child.jpg", "Describe the image in one sentence."],
             ],
             [image, message],
-            label="Image Captioning Examples"
         )
         gr.Examples(
             [
@@ -140,34 +105,26 @@ with gr.Blocks(theme=gr.themes.Default(primary_hue="blue")) as demo:
                 ["images/three_people.jpg", "What are these people doing?"]
             ],
             [image, message],
-            label="Visual Question Answering (VQA) Examples"
         )
-    # Define the event handlers for submitting a message
     response_handler = (
         response,
         [message, chat, image],
         [chat, submit, stop]
     )
-    # This handler runs after the generation is complete to reset the button states
     postresponse_handler = (
-        lambda: (gr.update(visible=False), gr.update(visible=True, interactive=True)),
         None,
-        [stop, submit],
     )
-    # Register the event listeners
-    # Trigger generation on both text submission (Enter key) and button click
     event1 = message.submit(*response_handler)
     event1.then(*postresponse_handler)
     event2 = submit.click(*response_handler)
     event2.then(*postresponse_handler)
-    # The stop button cancels the generation events
     stop.click(None, None, None, cancels=[event1, event2])
-# Use a queue for smooth streaming and handling multiple users
 demo.queue()
-# Set share=True to create a public link, necessary for most cloud environments
-demo.launch(share=True)

 from transformers import AutoModel, AutoProcessor
 from transformers import StoppingCriteria, TextIteratorStreamer, StoppingCriteriaList
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True).to(device)
 processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
 class StopOnTokens(StoppingCriteria):
     """Stopping criterion that ends generation when the newest token is a stop id.

     Only the first sequence in the batch (``input_ids[0]``) is inspected.
     """
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
         # NOTE(review): 151645 is presumably the tokenizer's end-of-turn /
         # end-of-text id for this model - confirm against the tokenizer config.
         stop_ids = (151645,)
         newest_token = input_ids[0][-1]
         # True as soon as the most recently generated token matches any stop id.
         return any(newest_token == candidate for candidate in stop_ids)
 @torch.no_grad()
 def response(message, history, image):
     """Stream the model's reply to ``message`` about ``image`` into the chat history.

     Generator used as a Gradio event handler. Every yield is a 3-tuple:
     the updated history, a hidden submit button, and a visible, interactive
     stop button, so the UI shows "Stop" while tokens are arriving.

     Args:
         message: Text entered by the user for the current turn.
         history: List of ``[user_msg, assistant_msg]`` pairs from earlier
             turns; the new turn is appended to it in place.
         image: Image supplied by the image input, or ``None`` when nothing
             was uploaded.

     Raises:
         gr.Error: If ``image`` is ``None``.
     """
     # The feature extractor below is always invoked, so a missing image would
     # otherwise fail with an opaque traceback; report it to the user instead.
     if image is None:
         raise gr.Error("Please upload an image before sending a message.")
     stop = StopOnTokens()
     # Rebuild the whole conversation for the chat template.
     messages = [{"role": "system", "content": "You are a helpful assistant."}]
     for user_msg, assistant_msg in history:
         messages.append({"role": "user", "content": user_msg})
         messages.append({"role": "assistant", "content": assistant_msg})
     # Only the first user turn gets the <image> placeholder. It is kept in the
     # stored history so later turns replay the same prompt text.
     if len(messages) == 1:
         message = f" <image>{message}"
     messages.append({"role": "user", "content": message})
     input_ids = processor.tokenizer.apply_chat_template(
         messages,
         add_generation_prompt=True,
         return_tensors="pt"
     )
     # Add a leading batch dimension to the extracted image features.
     pixel_values = processor.feature_extractor(image).unsqueeze(0)
     # Mask length is prompt tokens + num_image_latents - 1.
     # NOTE(review): presumably the single <image> placeholder expands to
     # num_image_latents positions inside the model - confirm.
     attention_mask = torch.ones(
         1, input_ids.shape[1] + processor.num_image_latents - 1
     )
     model_inputs = {
         "input_ids": input_ids,
         "images": pixel_values,
         "attention_mask": attention_mask,
     }
     model_inputs = {name: tensor.to(device) for name, tensor in model_inputs.items()}
     streamer = TextIteratorStreamer(
         processor.tokenizer,
         timeout=30.,
         skip_prompt=True,
         skip_special_tokens=True,
     )
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
         max_new_tokens=1024,
         stopping_criteria=StoppingCriteriaList([stop])
     )
     # Generate on a worker thread so this generator can consume the streamer
     # and push partial text to the UI while tokens are still being produced.
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
     history.append([message, ""])
     partial_response = ""
     for new_token in streamer:
         partial_response += new_token
         history[-1][1] = partial_response
         yield history, gr.Button(visible=False), gr.Button(visible=True, interactive=True)
+with gr.Blocks() as demo:
     with gr.Row():
+        image = gr.Image(type="pil")
         with gr.Column():
+            chat = gr.Chatbot(show_label=False)
+            message = gr.Textbox(interactive=True, show_label=False, container=False)
             with gr.Row():
+                gr.ClearButton([chat, message])
+                stop = gr.Button(value="Stop", variant="stop", visible=False)
+                submit = gr.Button(value="Submit", variant="primary")
     with gr.Row():
         gr.Examples(
                 ["images/child.jpg", "Describe the image in one sentence."],
             ],
             [image, message],
+            label="Captioning"
         )
         gr.Examples(
             [
                 ["images/three_people.jpg", "What are these people doing?"]
             ],
             [image, message],
+            label="VQA"
         )
     response_handler = (
         response,
         [message, chat, image],
         [chat, submit, stop]
     )
     postresponse_handler = (
+        lambda: (gr.Button(visible=False), gr.Button(visible=True)),
         None,
+        [stop, submit]
     )
     event1 = message.submit(*response_handler)
     event1.then(*postresponse_handler)
     event2 = submit.click(*response_handler)
     event2.then(*postresponse_handler)
     stop.click(None, None, None, cancels=[event1, event2])
 demo.queue()
+demo.launch()