"""Gradio demo for image inference with Lumian2-VLR-7B-Thinking.

Streams the model's response for an uploaded image and a text query, then
optionally draws boxes at the (x, y) coordinates found in the model's
<think> block.
"""

import os
import time
from threading import Thread
import re
from PIL import Image, ImageDraw

import gradio as gr
import spaces
import torch

from transformers import (
    Qwen2_5_VLForConditionalGeneration,
    AutoProcessor,
    TextIteratorStreamer,
)

# Constants for text generation
# Upper bound and default value of the "Max new tokens" slider in the UI.
MAX_MAX_NEW_TOKENS = 2048
DEFAULT_MAX_NEW_TOKENS = 1024
# Overridable through the MAX_INPUT_TOKEN_LENGTH environment variable; passed
# to the processor as `max_length` in generate_image.
# NOTE(review): that call also sets truncation=False, so this limit presumably
# has no effect — confirm.
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Load Lumian2-VLR-7B-Thinking
# The processor and the model are loaded once at import time and shared by
# every request. The model is moved to `device` and put in eval mode.
# NOTE(review): float16 weights are requested even when `device` falls back
# to CPU — confirm that is intended.
MODEL_ID_Y = "prithivMLmods/Lumian2-VLR-7B-Thinking"
processor = AutoProcessor.from_pretrained(MODEL_ID_Y, trust_remote_code=True)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_Y,
    trust_remote_code=True,
    torch_dtype=torch.float16
).to(device).eval()

def parse_model_output(text: str):
    """
    Splits raw model output into the final answer and the grounded points.

    Returns a tuple ``(answer, coordinates)``:

    * ``answer`` is the stripped content of the first ``<answer>...</answer>``
      block, or the whole of ``text`` unchanged when no such block exists.
    * ``coordinates`` is a list of ``(x, y)`` integer tuples, one for each
      ``(digits, digits)`` pair inside the first ``<think>...</think>`` block,
      or an empty list when there is no such block.
    """
    # Only the first <think> block is searched for "(x, y)" pairs.
    reasoning = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
    if reasoning is None:
        points = []
    else:
        points = [
            (int(pair.group(1)), int(pair.group(2)))
            for pair in re.finditer(r'\((\d+),\s*(\d+)\)', reasoning.group(1))
        ]

    # Without an <answer> block the raw text itself is used as the answer.
    tagged_answer = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if tagged_answer is None:
        return text, points
    return tagged_answer.group(1).strip(), points

def draw_bounding_boxes(image: Image.Image, coordinates: list, box_size: int = 60, use_dotted_style: bool = False):
    """
    Draws square bounding boxes on the image at the given coordinates.

    Args:
        image: Source PIL image. It is never modified; drawing happens on a copy.
        coordinates: Iterable of ``(x, y)`` box centres in pixel units.
        box_size: Approximate side length of each square; every box spans
            ``box_size // 2`` pixels on each side of its centre.
        use_dotted_style: When True, draw a semi-transparent blue fill with a
            solid blue outline; otherwise draw a red outline only.

    Returns:
        ``image`` itself when ``coordinates`` is empty, otherwise a new image
        with the boxes drawn. Images whose mode is neither "RGB" nor "RGBA"
        are returned as "RGB".
    """
    if not coordinates:
        return image

    # ImageDraw accepts the "RGBA" drawing mode only for RGB images (where it
    # blends) or RGBA images; any other image mode raises
    # ValueError("mode mismatch"). Convert such images to RGB so they can be
    # annotated too. RGB and RGBA inputs are copied unchanged.
    if image.mode in ("RGB", "RGBA"):
        img_with_boxes = image.copy()
    else:
        img_with_boxes = image.convert("RGB")
    draw = ImageDraw.Draw(img_with_boxes, "RGBA")

    half_box = box_size // 2

    # The style does not depend on the coordinate, so pick it once.
    if use_dotted_style:
        # "Dotted like seaborn" - a semi-transparent fill with a solid outline
        rect_style = {
            "fill": (0, 100, 255, 60),  # Light blue, semi-transparent
            "outline": (0, 0, 255),  # Solid blue
            "width": 2,
        }
    else:
        # Default solid box
        rect_style = {"outline": (255, 0, 0), "width": 3}  # Red

    for (x, y) in coordinates:
        # Box corners are centred on (x, y)
        draw.rectangle(
            [x - half_box, y - half_box, x + half_box, y + half_box],
            **rect_style,
        )

    return img_with_boxes

@spaces.GPU
def generate_image(text: str, image: Image.Image,
                   max_new_tokens: int,
                   temperature: float,
                   top_p: float,
                   top_k: int,
                   repetition_penalty: float,
                   draw_boxes: bool,
                   use_dotted_style: bool):
    """
    Streams a model response for ``image`` and ``text``, then annotates the image.

    This is a generator. Every item it yields is a 3-tuple
    ``(raw_text, display_text, image)``:

    * with no image, a single "please upload" message with ``None`` as image;
    * otherwise an initial ``("", "", image)``, then the accumulated text so
      far (in both text slots) with the original image after each streamed
      chunk;
    * finally the full raw text, the parsed answer, and the image with boxes
      drawn when ``draw_boxes`` is set and coordinates were found (otherwise
      the original image).
    """
    if image is None:
        no_image_msg = "Please upload an image."
        yield no_image_msg, no_image_msg, None
        return

    # Show the untouched image right away while the model is still working.
    yield "", "", image

    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": image},
            {"type": "text", "text": text},
        ]
    }
    chat_prompt = processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )
    model_inputs = processor(
        text=[chat_prompt],
        images=[image],
        return_tensors="pt",
        padding=True,
        truncation=False,
        max_length=MAX_INPUT_TOKEN_LENGTH
    ).to(device)

    token_stream = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)

    # Model inputs first, then the sampling settings coming from the UI.
    generate_kwargs = {**model_inputs}
    generate_kwargs.update({
        "streamer": token_stream,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
        "do_sample": True
    })

    # Generation runs in a background thread; this generator consumes the
    # streamer and forwards partial text as it arrives.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    transcript = ""
    for piece in token_stream:
        transcript += piece
        time.sleep(0.01)
        # The image stays unchanged until generation has finished.
        yield transcript, transcript, image

    # Generation is done: extract the answer and any grounded coordinates.
    final_answer, coordinates = parse_model_output(transcript)

    if draw_boxes and coordinates:
        annotated = draw_bounding_boxes(image, coordinates, use_dotted_style=use_dotted_style)
    else:
        annotated = image

    # Last update: raw text, parsed answer and the (possibly annotated) image.
    yield transcript, final_answer, annotated

# Define examples for image inference
# Each entry is [query text, image path]; the pairs are passed to gr.Examples
# with inputs [image_query, image_upload], in that order.
# NOTE(review): the image paths are relative ("images/...") — presumably
# resolved against the app's working directory; confirm the files ship with it.
image_examples = [
    ["Explain the content in detail.", "images/D.jpg"],
    ["Explain the content (ocr).", "images/O.jpg"],
    ["What is the core meaning of the poem?", "images/S.jpg"],
    ["Provide a detailed caption for the image.", "images/A.jpg"],
    ["Explain the pie-chart in detail.", "images/2.jpg"],
    ["Jsonify Data.", "images/1.jpg"],
]

# Custom CSS handed to gr.Blocks. `.submit-btn` is attached to the Submit
# button through elem_classes. `.canvas-output` is not referenced by any
# component in the visible part of this file.
css = """
.submit-btn {
    background-color: #2980b9 !important;
    color: white !important;
}
.submit-btn:hover {
    background-color: #3498db !important;
}
.canvas-output {
    border: 2px solid #4682B4;
    border-radius: 10px;
    padding: 20px;
}
"""

# Create the Gradio Interface
# Layout: a left column with the inputs (query, image, sampling options,
# examples) and a wider right column with the tabbed outputs, model info and
# the bounding-box checkboxes.
with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
    gr.Markdown("# **Lumian2-VLR-7B-Thinking Image Inference**")
    with gr.Row():
        # Left column: user inputs.
        with gr.Column(scale=1):
            gr.Markdown("## Image Inference")
            image_query = gr.Textbox(label="Query Input", placeholder="Enter your query here...")
            # type="pil" makes the callback receive a PIL image.
            image_upload = gr.Image(type="pil", label="Image")
            image_submit = gr.Button("Submit", elem_classes="submit-btn")
            
            # Sampling parameters forwarded to generate_image.
            with gr.Accordion("Advanced options", open=False):
                max_new_tokens = gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_MAX_NEW_TOKENS, step=1, value=DEFAULT_MAX_NEW_TOKENS)
                temperature = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
                top_p = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
                top_k = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
                repetition_penalty = gr.Slider(label="Repetition penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.2)
                
            # Example rows fill the query textbox and the image upload.
            gr.Examples(
                examples=image_examples,
                inputs=[image_query, image_upload]
            )

        # Right column: outputs, one tab per value yielded by generate_image.
        with gr.Column(scale=2):
            gr.Markdown("## Output")
            with gr.Tabs():
                with gr.TabItem("Image with Bounding Box"):
                    image_output = gr.Image(label="Processed Image")
                with gr.TabItem("Raw Text"):
                    output = gr.Textbox(label="Raw Model Output", interactive=False, lines=10)
                with gr.TabItem("Parsed Answer"):
                    markdown_output = gr.Markdown(label="Parsed Answer")

            # NOTE(review): the "Report Bug" link targets the Qwen2.5-VL Space
            # discussions — confirm this is the intended destination.
            gr.Markdown("**Model Info 💻** | [Report Bug](https://huggingface.co/spaces/prithivMLmods/Qwen2.5-VL/discussions)")

            gr.Markdown(
                """> [Lumian2-VLR-7B-Thinking](https://huggingface.co/prithivMLmods/Lumian2-VLR-7B-Thinking): The Lumian2-VLR-7B-Thinking model is a high-fidelity vision-language reasoning (experimental model) system designed for fine-grained multimodal understanding. Built on Qwen2.5-VL-7B-Instruct, this model enhances image captioning, and document comprehension through explicit grounded reasoning. It produces structured reasoning traces aligned with visual coordinates, enabling explainable multimodal reasoning."""
            )

            # These checkboxes sit in the output column but are inputs to
            # generate_image (draw_boxes, use_dotted_style).
            with gr.Row():
                draw_boxes_checkbox = gr.Checkbox(label="Draw Bounding Boxes", value=True)
                dotted_style_checkbox = gr.Checkbox(label="Use Dotted Style for Boxes", value=False)


    # The order of `inputs` must match generate_image's positional parameters;
    # `outputs` receives each yielded (raw text, parsed text, image) tuple.
    image_submit.click(
        fn=generate_image,
        inputs=[image_query, image_upload, max_new_tokens, temperature, top_p, top_k, repetition_penalty, draw_boxes_checkbox, dotted_style_checkbox],
        outputs=[output, markdown_output, image_output]
    )

if __name__ == "__main__":
    # Enable request queuing (at most 50 pending requests) and start the app
    # with a public share link.
    demo.queue(max_size=50).launch(share=True)