Spaces: Runtime error
Update app.py
app.py CHANGED
@@ -1,240 +1,19 @@
-# import gradio as gr
-# from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, TextIteratorStreamer
-# from threading import Thread
-# from qwen_vl_utils import process_vision_info
-# import torch
-# import time
-
-# # Check if a GPU is available
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# local_path = "Fancy-MLLM/R1-OneVision-7B"
-
-# # Load the model on the appropriate device (GPU if available, otherwise CPU)
-# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-#     local_path, torch_dtype="auto", device_map=device
-# )
-# processor = AutoProcessor.from_pretrained(local_path)
-
-# def generate_output(image, text, button_click):
-#     # Prepare input data
-#     messages = [
-#         {
-#             "role": "user",
-#             "content": [
-#                 {"type": "image", "image": image, 'min_pixels': 1003520, 'max_pixels': 12845056},
-#                 {"type": "text", "text": text},
-#             ],
-#         }
-#     ]
-
-#     # Prepare inputs for the model
-#     text_input = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#     image_inputs, video_inputs = process_vision_info(messages)
-#     inputs = processor(
-#         text=[text_input],
-#         images=image_inputs,
-#         videos=video_inputs,
-#         padding=True,
-#         return_tensors="pt",
-#     )
-
-#     # Move inputs to the same device as the model
-#     inputs = inputs.to(model.device)
-
-#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-#     generation_kwargs = dict(
-#         **inputs,
-#         streamer=streamer,
-#         max_new_tokens=4096,
-#         top_p=0.001,
-#         top_k=1,
-#         temperature=0.01,
-#         repetition_penalty=1.0,
-#     )
-
-#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-#     thread.start()
-#     generated_text = ''
-
-#     try:
-#         for new_text in streamer:
-#             generated_text += new_text
-#             yield f"{generated_text}"
-#     except Exception as e:
-#         print(f"Error: {e}")
-#         yield f"Error occurred: {str(e)}"
-
-# Css = """
-# #output-markdown {
-#     overflow-y: auto;
-#     white-space: pre-wrap;
-#     word-wrap: break-word;
-# }
-# #output-markdown .math {
-#     overflow-x: auto;
-#     max-width: 100%;
-# }
-# .markdown-text {
-#     white-space: pre-wrap;
-#     word-wrap: break-word;
-# }
-# .markdown-output {
-#     min-height: 20vh;
-#     max-width: 100%;
-#     overflow-y: auto;
-# }
-# #qwen-md .katex-display { display: inline; }
-# #qwen-md .katex-display>.katex { display: inline; }
-# #qwen-md .katex-display>.katex>.katex-html { display: inline; }
-# """
-
-# with gr.Blocks(css=Css) as demo:
-#     gr.HTML("""<center><font size=8>🦖 R1-OneVision Demo</center>""")
-
-#     with gr.Row():
-#         with gr.Column():
-#             input_image = gr.Image(type="pil", label="Upload")  # switched back to PIL handling
-#             input_text = gr.Textbox(label="Input your question")
-#             with gr.Row():
-#                 clear_btn = gr.ClearButton([input_image, input_text])
-#                 submit_btn = gr.Button("Submit", variant="primary")
-
-#         with gr.Column():
-#             output_text = gr.Markdown(elem_id="qwen-md", container=True, elem_classes="markdown-output")
-
-#     submit_btn.click(fn=generate_output, inputs=[input_image, input_text], outputs=output_text)
-
-# demo.launch(share=False)
-
-
-# import gradio as gr
-# from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TextIteratorStreamer
-# from transformers.image_utils import load_image
-# from threading import Thread
-# import time
-# import torch
-# import spaces
-
-# MODEL_ID = "Fancy-MLLM/R1-OneVision-7B"
-# processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
-# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-#     MODEL_ID,
-#     trust_remote_code=True,
-#     torch_dtype=torch.bfloat16
-# ).to("cuda").eval()
-
-# @spaces.GPU(duration=200)
-# def model_inference(input_dict, history):
-#     text = input_dict["text"]
-#     files = input_dict["files"]
-
-#     # Load images if provided
-#     if len(files) > 1:
-#         images = [load_image(image) for image in files]
-#     elif len(files) == 1:
-#         images = [load_image(files[0])]
-#     else:
-#         images = []
-
-#     # Validate input
-#     if text == "" and not images:
-#         gr.Error("Please input a query and optionally image(s).")
-#         return
-#     if text == "" and images:
-#         gr.Error("Please input a text query along with the image(s).")
-#         return
-
-#     # Prepare messages for the model
-#     messages = [
-#         {
-#             "role": "user",
-#             "content": [
-#                 *[{"type": "image", "image": image} for image in images],
-#                 {"type": "text", "text": text},
-#             ],
-#         }
-#     ]
-
-#     # Apply chat template and process inputs
-#     prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-#     inputs = processor(
-#         text=[prompt],
-#         images=images if images else None,
-#         return_tensors="pt",
-#         padding=True,
-#     ).to("cuda")
-
-#     # # Set up streamer for real-time output
-#     # streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-#     # generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=2048)
-
-#     # # Start generation in a separate thread
-#     # thread = Thread(target=model.generate, kwargs=generation_kwargs)
-#     # thread.start()
-
-#     # # Stream the output
-#     # buffer = ""
-#     # yield "Thinking..."
-#     # for new_text in streamer:
-#     #     buffer += new_text
-#     #     time.sleep(0.01)
-#     #     yield buffer
-#     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-#     generation_kwargs = dict(
-#         **inputs,
-#         streamer=streamer,
-#         max_new_tokens=2048,
-#         top_p=0.001,
-#         top_k=1,
-#         temperature=0.01,
-#         repetition_penalty=1.0,
-#     )
-
-#     thread = Thread(target=model.generate, kwargs=generation_kwargs)
-#     thread.start()
-#     generated_text = ''
-
-#     try:
-#         for new_text in streamer:
-#             generated_text += new_text
-#             yield generated_text
-#     except Exception as e:
-#         print(f"Error: {e}")
-#         yield f"Error occurred: {str(e)}"
-
-# examples = [
-#     [{"text": "Hint: Please answer the question and provide the final answer at the end. Question: Which number do you have to write in the last daisy?", "files": ["5.jpg"]}]
-# ]
-
-# demo = gr.ChatInterface(
-#     fn=model_inference,
-#     description="# **🦖 Fancy-MLLM/R1-OneVision-7B**",
-#     examples=examples,
-#     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"], file_count="multiple"),
-#     stop_btn="Stop Generation",
-#     multimodal=True,
-#     cache_examples=False,
-# )
-
-# demo.launch(debug=True)
-
-
 import os
 from datetime import datetime
+import subprocess
 import time
-from threading import Thread
 
 # Third-party imports
 import numpy as np
 import torch
 from PIL import Image
+import accelerate
 import gradio as gr
 import spaces
 from transformers import (
     Qwen2_5_VLForConditionalGeneration,
-    AutoProcessor,
-    TextIteratorStreamer
+    AutoTokenizer,
+    AutoProcessor
 )
 
 # Local imports
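Note: both retired implementations above stream tokens with TextIteratorStreamer rather than waiting for generate() to finish. A minimal, self-contained sketch of that pattern, using a small stand-in checkpoint (gpt2) instead of the 7B model:

# Sketch of the streaming pattern from the commented-out code above.
# "gpt2" is a stand-in checkpoint, not the model this Space serves.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer("The quick brown fox", return_tensors="pt")

# generate() blocks, so it runs in a background thread while the main
# thread consumes decoded text chunks from the streamer as they arrive.
thread = Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=32))
thread.start()

for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()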
@@ -250,6 +29,7 @@ else:
 
 print(f"[INFO] Using device: {device}")
 
+
 def array_to_image_path(image_array):
     if image_array is None:
         raise ValueError("No image provided. Please upload an image before submitting.")
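Note: the body of array_to_image_path between the ValueError check and the full_path lines is collapsed in this diff. Judging from the surviving first and last lines, it writes the uploaded array to disk and returns an absolute path. A hypothetical sketch of such a helper, not the Space's actual code:

# Hypothetical reconstruction for illustration only; the real body is
# hidden by the diff's context folding.
import os
from datetime import datetime

import numpy as np
from PIL import Image

def array_to_image_path(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert the numpy array from gr.Image into a PIL image and save it
    # under a timestamped filename so each upload gets a unique path.
    img = Image.fromarray(np.uint8(image_array))
    filename = f"image_{datetime.now().strftime('%Y%m%d_%H%M%S')}.png"
    img.save(filename)
    full_path = os.path.abspath(filename)
    return full_path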
@@ -267,19 +47,19 @@ def array_to_image_path(image_array):
     full_path = os.path.abspath(filename)
 
     return full_path
-
+
 models = {
-    "Fancy-MLLM/R1-OneVision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-OneVision-7B",
+    "Fancy-MLLM/R1-Onevision-7B": Qwen2_5_VLForConditionalGeneration.from_pretrained("Fancy-MLLM/R1-Onevision-7B",
                                                                                      trust_remote_code=True,
                                                                                      torch_dtype="auto",
                                                                                      device_map="auto").eval(),
 }
 
 processors = {
-    "Fancy-MLLM/R1-OneVision-7B": AutoProcessor.from_pretrained("Fancy-MLLM/R1-OneVision-7B", trust_remote_code=True),
+    "Fancy-MLLM/R1-Onevision-7B": AutoProcessor.from_pretrained("Fancy-MLLM/R1-Onevision-7B", trust_remote_code=True),
 }
 
-DESCRIPTION = "[🦖 Fancy-MLLM/R1-OneVision-7B Demo]"
+DESCRIPTION = "[🦖 Fancy-MLLM/R1-Onevision-7B Demo]"
 
 kwargs = {}
 kwargs['torch_dtype'] = torch.bfloat16
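Note: device_map="auto" hands layer placement to the Accelerate library, which is presumably why this commit adds the accelerate import above (the package must be installed even though it is rarely called directly). A sketch of the same loading idiom with a small stand-in checkpoint:

# Sketch of the loading idiom used in the models dict above.
# device_map="auto" requires the accelerate package to be installed;
# torch_dtype="auto" picks the dtype recorded in the checkpoint config.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "gpt2",                 # stand-in checkpoint
    torch_dtype="auto",
    device_map="auto",
).eval()
print(model.hf_device_map)  # shows where each module was placed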
@@ -289,70 +69,55 @@ assistant_prompt = '<|assistant|>\n'
 prompt_suffix = "<|end|>\n"
 
 @spaces.GPU
-def run_example(input_dict, model_id):
-    text = input_dict["text"]
-    files = input_dict["files"]
-
-    # Load images if provided
-    images = []
-    if len(files) > 0:
-        images = [array_to_image_path(image) for image in files]
+def run_example(image, text_input=None, model_id=None):
+    start_time = time.time()
+    image_path = array_to_image_path(image)
 
-    # Validate input
-    if text == "" and not images:
-        yield "Error: Please input a query and optionally image(s)."
-        return
-    if text == "" and images:
-        yield "Error: Please input a text query along with the image(s)."
-        return
+    print(image_path)
+    model = models[model_id]
+    processor = processors[model_id]
 
-
+    image = Image.fromarray(image).convert("RGB")
     messages = [
-        {
+    {
         "role": "user",
         "content": [
-            *[{"type": "image", "image": image} for image in images],
-            {"type": "text", "text": text},
+            {
+                "type": "image",
+                "image": image_path,
+            },
+            {"type": "text", "text": text_input},
         ],
     }
 ]
 
-    # Apply chat template and process inputs
-    prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+    # Preparation for inference
+    text = processor.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
     image_inputs, video_inputs = process_vision_info(messages)
-    inputs = processors["Fancy-MLLM/R1-OneVision-7B"](
-        text=[prompt],
+    inputs = processor(
+        text=[text],
         images=image_inputs,
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    )
-
-    # Set up streamer for real-time output
-    streamer = TextIteratorStreamer(processors["Fancy-MLLM/R1-OneVision-7B"], skip_prompt=True, skip_special_tokens=True)
+    )
+    inputs = inputs.to(device)
 
-    # Set up generation kwargs
-    generation_kwargs = dict(
-        **inputs,
-        streamer=streamer,
-        max_new_tokens=2048,
-        top_p=0.001,
-        top_k=1,
-        temperature=0.01,
-        repetition_penalty=1.0,
+    # Inference: Generation of the output
+    generated_ids = model.generate(**inputs, max_new_tokens=2048)
+    generated_ids_trimmed = [
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+    ]
+    output_text = processor.batch_decode(
+        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
     )
 
-    # Start generation in a separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    # Stream the output
-    buffer = ""
-    yield "Thinking..."
-    for new_text in streamer:
-        buffer += new_text
-        time.sleep(0.01)
-        yield buffer
+    end_time = time.time()
+    total_time = round(end_time - start_time, 2)
+
+    return output_text[0], total_time
 
 css = """
 #output {
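Note: the replacement drops streaming for a single blocking generate() call, and the list comprehension trims each prompt off the front of the output. That trim is needed because, for decoder-only models, generate() returns the prompt and the completion concatenated. A small sketch of the difference, again with a stand-in checkpoint:

# Sketch: why the out_ids[len(in_ids):] trim above matters.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer(["The capital of France is"], return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=8)

# Without trimming: the decoded string echoes the prompt back.
print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))

# With trimming (the same idiom as run_example): only the new tokens remain.
trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
print(tokenizer.batch_decode(trimmed, skip_special_tokens=True))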
@@ -364,21 +129,20 @@ css = """
 
 with gr.Blocks(css=css) as demo:
     gr.Markdown(DESCRIPTION)
-    with gr.Tab(label="R1-OneVision-7B Input"):
+    with gr.Tab(label="R1-Onevision-7B Input"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(label="Input Picture")
+                input_img = gr.Image(label="Input Picture")
                 model_selector = gr.Dropdown(choices=list(models.keys()),
                                              label="Model",
-                                             value="Fancy-MLLM/R1-OneVision-7B")
+                                             value="Fancy-MLLM/R1-Onevision-7B")
                 text_input = gr.Textbox(label="Text Prompt")
                 submit_btn = gr.Button(value="Submit")
             with gr.Column():
-                output_text = gr.Textbox(label="Output Text")
+                output_text = gr.Textbox(label="Output Text")
+                time_taken = gr.Textbox(label="Time taken for processing + inference")
 
-    submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])
+    submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text, time_taken])
 
     demo.queue(api_open=False)
     demo.launch(debug=True)
-
-
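Note: the click handler now maps run_example's two return values onto two output components in order. The same wiring in miniature, with a toy function standing in for the model call:

# Toy sketch of the two-output wiring used above: a function that returns
# a tuple fills the listed output components positionally.
import time

import gradio as gr

def toy_run(prompt):
    start = time.time()
    result = prompt.upper()  # stand-in for model inference
    return result, round(time.time() - start, 2)

with gr.Blocks() as toy_demo:
    box_in = gr.Textbox(label="Text Prompt")
    btn = gr.Button(value="Submit")
    box_out = gr.Textbox(label="Output Text")
    box_time = gr.Textbox(label="Time taken for processing + inference")
    btn.click(toy_run, [box_in], [box_out, box_time])

toy_demo.launch()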