# Hugging Face Space: Lumian2-VLR-7B-Thinking image inference (runs on ZeroGPU).
import os
import time
from threading import Thread
import re
from PIL import Image, ImageDraw
import gradio as gr
import spaces
import torch
from transformers import (
Qwen2_5_VLForConditionalGeneration,
AutoProcessor,
TextIteratorStreamer,
)
# Constants for text generation
MAX_MAX_NEW_TOKENS = 2048  # upper bound for the "Max new tokens" UI slider
DEFAULT_MAX_NEW_TOKENS = 1024  # default slider value
# Input-length cap, overridable via environment variable.
# NOTE(review): this value is passed to the processor below, but with
# truncation disabled it is not actually enforced — confirm intent.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
# Prefer the first CUDA device when available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Load Lumian2-VLR-7B-Thinking
MODEL_ID_Y = "prithivMLmods/Lumian2-VLR-7B-Thinking"
processor = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
# float16 weights, moved to the selected device, in inference mode.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_Y,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()
def parse_model_output(text: str):
    """
    Split a model response into its final answer and any point coordinates.

    The model emits reasoning inside ``<think>...</think>`` and its final
    reply inside ``<answer>...</answer>``. Every ``(x, y)`` integer pair
    found in the reasoning block is collected as a coordinate.

    Args:
        text: Raw model output, possibly containing both tagged blocks.

    Returns:
        tuple: ``(answer, coordinates)`` where ``answer`` is the stripped
        contents of the <answer> block (or the full raw text when no such
        block exists) and ``coordinates`` is a list of (x, y) int tuples.
    """
    points = []
    think_block = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if think_block is not None:
        # Pull every "(x, y)" pair out of the reasoning trace.
        for x_str, y_str in re.findall(r'\((\d+),\s*(\d+)\)', think_block.group(1)):
            points.append((int(x_str), int(y_str)))
    answer_block = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if answer_block is None:
        # No explicit answer tag: fall back to the whole raw response.
        return text, points
    return answer_block.group(1).strip(), points
def draw_bounding_boxes(image: Image.Image, coordinates: list, box_size: int = 60, use_dotted_style: bool = False):
    """
    Return a copy of *image* with a square box centred on each coordinate.

    Args:
        image: Source PIL image; it is never modified in place.
        coordinates: Iterable of (x, y) centre points for the boxes.
        box_size: Side length of each square box, in pixels.
        use_dotted_style: When True, draw a semi-transparent blue fill with
            a solid blue outline; otherwise draw a plain red outline.

    Returns:
        PIL.Image.Image: The annotated copy, or the original image object
        unchanged when *coordinates* is empty.
    """
    if not coordinates:
        return image
    annotated = image.copy()
    canvas = ImageDraw.Draw(annotated, "RGBA")
    offset = box_size // 2
    for cx, cy in coordinates:
        # Square box centred on (cx, cy).
        corners = [cx - offset, cy - offset, cx + offset, cy + offset]
        if use_dotted_style:
            # "Dotted like seaborn": light translucent fill + solid blue edge.
            canvas.rectangle(corners, fill=(0, 100, 255, 60), outline=(0, 0, 255), width=2)
        else:
            # Default: plain solid red outline.
            canvas.rectangle(corners, outline=(255, 0, 0), width=3)
    return annotated
@spaces.GPU
def generate_image(text: str, image: Image.Image,
                   max_new_tokens: int,
                   temperature: float,
                   top_p: float,
                   top_k: int,
                   repetition_penalty: float,
                   draw_boxes: bool,
                   use_dotted_style: bool):
    """
    Stream a model response for (text, image) and optionally draw boxes.

    Yields ``(raw_text, display_text, output_image)`` tuples: first the
    original image with empty text, then incremental text updates while
    tokens stream in, and finally the parsed answer alongside the image
    annotated with any coordinates found in the reasoning trace.

    Args:
        text: User query.
        image: Uploaded PIL image (None triggers an error message).
        max_new_tokens / temperature / top_p / top_k / repetition_penalty:
            Sampling parameters forwarded to ``model.generate``.
        draw_boxes: Whether to draw boxes at parsed coordinates.
        use_dotted_style: Box style flag forwarded to draw_bounding_boxes.
    """
    if image is None:
        yield "Please upload an image.", "Please upload an image.", None
        return
    # Show the unmodified image immediately so the output pane isn't empty.
    yield "", "", image
    messages = [{
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }]
    prompt_full = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # NOTE(review): with truncation=False the max_length argument is ignored,
    # so MAX_INPUT_TOKEN_LENGTH is not actually enforced here. Left as-is
    # deliberately: truncating multimodal inputs can drop image tokens.
    inputs = processor(
        text=[prompt_full],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "do_sample": True
    }
    # Run generation in a background thread so we can stream partial text.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
        # During generation, yield text updates but keep the original image.
        yield buffer, buffer, image
    # Fix: ensure the generation thread has fully finished before parsing
    # its output (the streamer being exhausted does not guarantee the
    # generate() call has returned).
    thread.join()
    # After generation is complete, parse the output and draw boxes.
    final_answer, coordinates = parse_model_output(buffer)
    output_image = image
    if draw_boxes and coordinates:
        output_image = draw_bounding_boxes(image, coordinates, use_dotted_style=use_dotted_style)
    # Yield the final result with the processed image.
    yield buffer, final_answer, output_image
# Define examples for image inference.
# Each entry is [query text, path to a bundled example image] matching the
# (image_query, image_upload) inputs of gr.Examples below.
image_examples = [
    ["Explain the content in detail.", "images/D.jpg"],
    ["Explain the content (ocr).", "images/O.jpg"],
    ["What is the core meaning of the poem?", "images/S.jpg"],
    ["Provide a detailed caption for the image.", "images/A.jpg"],
    ["Explain the pie-chart in detail.", "images/2.jpg"],
    ["Jsonify Data.", "images/1.jpg"],
]
css = """
.submit-btn {
background-color: #2980b9 !important;
color: white !important;
}
.submit-btn:hover {
background-color: #3498db !important;
}
.canvas-output {
border: 2px solid #4682B4;
border-radius: 10px;
padding: 20px;
}
"""
# Create the Gradio Interface: two-column layout with query/image inputs on
# the left and tabbed outputs (annotated image, raw text, parsed answer) on
# the right, wired to generate_image on submit.
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **Lumian2-VLR-7B-Thinking Image Inference**")
    with gr.Row():
        # Left column: query, image upload, submit, sampling controls.
        with gr.Column(scale=1):
            gr.Markdown("## Image Inference")
            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            # Sampling parameters forwarded verbatim to model.generate.
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
            gr.Examples(
                examples=image_examples,
                inputs=[image_query, image_upload]
            )
        # Right column: tabbed outputs fed by generate_image's yields.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            with gr.Tabs():
                with gr.TabItem("Image with Bounding Box"):
                    image_output = gr.Image(label="Processed Image")
                with gr.TabItem("Raw Text"):
                    output = gr.Textbox(label="Raw Model Output", interactive=False, lines=10)
                with gr.TabItem("Parsed Answer"):
                    markdown_output = gr.Markdown(label="Parsed Answer")
            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")
            gr.Markdown(
                """> [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): The Lumian2-VLR-7B-Thinking model is a high-fidelity vision-language reasoning (experimental model) system designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, this model enhances image captioning, and document comprehension through explicit grounded reasoning. It produces structured reasoning traces aligned with visual coordinates, enabling explainable multimodal reasoning."""
            )
    # Box-drawing toggles consumed by generate_image.
    with gr.Row():
        draw_boxes_checkbox = gr.Checkbox(label="Draw Bounding Boxes", value=True)
        dotted_style_checkbox = gr.Checkbox(label="Use Dotted Style for Boxes", value=False)
    # Wire the submit button: outputs map to (raw text, parsed answer, image).
    image_submit.click(
        fn=generate_image,
        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, draw_boxes_checkbox, dotted_style_checkbox],
        outputs=[output, markdown_output, image_output]
    )
if __name__ == "__main__":
    # Fix: removed trailing markdown residue (```) that made this line a
    # syntax error. Queueing serves concurrent users in order; share=True
    # exposes a public Gradio link.
    demo.queue(max_size=50).launch(share=True)