File size: 10,736 Bytes
0cb9ad5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0b964b6
0cb9ad5
 
 
 
 
 
 
bca5918
 
 
 
 
 
 
 
0cb9ad5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bca5918
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
import os
import os.path as osp

import gradio as gr

# Markdown banner rendered at the top of the Gradio interface (via gr.Markdown).
# NOTE: this is a runtime string shown to users — keep links and wording intact.
HEADER = """
# Penguin-VL Gradio Interface

Developed by [Penguin-VL](https://github.com/tencent-ailab/Penguin-VL) team at Tencent AI Lab.

Note: speed on ZeroGPU does not reflect real model speed and may be influenced by the shared environment. For stable and fast Gradio Space deployment and running, please visit [the local UI instructions](https://github.com/tencent-ailab/Penguin-VL?tab=readme-ov-file#-gradio-demo-local-ui). For usage examples and expected results, please refer to [here](https://github.com/tencent-ailab/Penguin-VL/blob/master/inference/notebooks/01_penguinvl_inference_recipes.public.ipynb).

Please login with your Hugging Face account first. We provide some example images and videos for easier trials.
"""


class PenguinVLQwen3GradioInterface(object):
    """Gradio chat interface for Penguin-VL.

    Collects image/video uploads and text into an OpenAI-style message list,
    normalizes it into the model client's multi-part conversation schema, and
    streams the assistant reply back into the chatbot.
    """

    def __init__(self, model_client, example_dir=None, default_system_prompt="You are a helpful assistant developed by Tencent AI Lab PenguinVL team.", **server_kwargs):
        """Build the Blocks UI (does not launch it; see :meth:`launch`).

        Args:
            model_client: Object exposing ``submit(payload)``. It may return a
                complete response string or an iterable of streamed tokens.
            example_dir: Optional directory scanned for example images/videos.
            default_system_prompt: Fallback system prompt used when the
                "System Prompt" textbox is left empty.
            **server_kwargs: Forwarded verbatim to ``gr.Blocks.launch``.
        """
        self.model_client = model_client
        self.server_kwargs = server_kwargs
        self.default_system_prompt = (default_system_prompt or "").strip()

        # Recognized media extensions, lowercase and without the leading dot.
        self.image_formats = ("png", "jpg", "jpeg")
        self.video_formats = ("mp4", "mov")
        image_examples, video_examples = [], []
        if example_dir is not None:
            # sorted() keeps the example ordering deterministic across runs.
            # Compare the actual extension rather than using str.endswith():
            # the latter also matches names like "foo.apng" or a bare "xpng".
            for filename in sorted(os.listdir(example_dir)):
                example_file = osp.join(example_dir, filename)
                ext = osp.splitext(example_file)[1].lower().lstrip(".")
                if ext in self.image_formats:
                    image_examples.append([example_file])
                elif ext in self.video_formats:
                    video_examples.append([example_file])

        with gr.Blocks() as self.interface:
            gr.Markdown(HEADER)
            with gr.Row():
                chatbot_kwargs = {"elem_id": "chatbot", "height": 710}
                try:
                    chatbot = gr.Chatbot(type="messages", **chatbot_kwargs)
                except TypeError:
                    # Gradio 6 uses OpenAI-style messages by default and removed the `type` arg.
                    chatbot = gr.Chatbot(**chatbot_kwargs)

                with gr.Column():
                    with gr.Tab(label="Input"):

                        with gr.Row():
                            input_video = gr.Video(sources=["upload"], label="Upload Video")
                            input_image = gr.Image(sources=["upload"], type="filepath", label="Upload Image")

                        if image_examples:
                            gr.Examples(image_examples, inputs=[input_image], label="Example Images")
                        if video_examples:
                            gr.Examples(video_examples, inputs=[input_video], label="Example Videos")

                        input_text = gr.Textbox(label="Input Text", placeholder="Type your message here and press enter to submit")

                        submit_button = gr.Button("Generate")

                    with gr.Tab(label="Configure"):
                        with gr.Accordion("Prompt Config", open=True):
                            system_prompt = gr.Textbox(
                                value=self.default_system_prompt,
                                label="System Prompt",
                                lines=4,
                                placeholder="Optional: system instruction prepended to each request",
                            )

                        with gr.Accordion("Generation Config", open=True):
                            do_sample = gr.Checkbox(value=True, label="Do Sample")
                            temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, label="Temperature")
                            top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
                            max_new_tokens = gr.Slider(minimum=0, maximum=4096, value=1536, step=1, label="Max New Tokens")

                        with gr.Accordion("Video Config", open=True):
                            fps = gr.Slider(minimum=0.0, maximum=10.0, value=1, label="FPS")
                            max_frames = gr.Slider(minimum=0, maximum=256, value=180, step=1, label="Max Frames")

            # Uploads are appended to the chat history immediately; the media
            # widget is cleared afterwards so the same file can be re-uploaded.
            input_video.change(self._on_video_upload, [chatbot, input_video], [chatbot, input_video])
            input_image.change(self._on_image_upload, [chatbot, input_image], [chatbot, input_image])

            # The textbox Enter key and the button trigger the same handler
            # with identical inputs/outputs.
            predict_inputs = [
                chatbot, input_text, system_prompt, do_sample, temperature, top_p, max_new_tokens,
                fps, max_frames,
            ]
            input_text.submit(self._predict, predict_inputs, [chatbot, input_text])
            submit_button.click(self._predict, predict_inputs, [chatbot, input_text])

    def _on_video_upload(self, messages, video):
        """Append an uploaded video to the chat history and clear the widget."""
        messages = messages or []
        if video is not None:
            # Gradio renders {"path": ...} content as an inline media message.
            messages.append({"role": "user", "content": {"path": video}})
        return messages, None

    def _on_image_upload(self, messages, image):
        """Append an uploaded image to the chat history and clear the widget."""
        messages = messages or []
        if image is not None:
            messages.append({"role": "user", "content": {"path": image}})
        return messages, None

    def _on_text_submit(self, messages, text):
        """Append a text message to the chat history and clear the textbox.

        Kept for API compatibility; `_predict` handles text submission itself.
        """
        messages = messages or []
        messages.append({"role": "user", "content": text})
        return messages, ""

    def _extract_media_path(self, content):
        """Recursively find a media file path inside Gradio message content.

        Accepts dicts (looking for a "path" key, then descending into values)
        and lists/tuples (first item that yields a path wins).

        Raises:
            ValueError: if *content* is text-only or holds no media path.
        """
        if isinstance(content, dict):
            if content.get("type") == "text" and isinstance(content.get("text"), str):
                raise ValueError(f"Text content is not media: {content}")
            media_path = content.get("path")
            if media_path:
                return media_path
            for value in content.values():
                try:
                    return self._extract_media_path(value)
                except ValueError:
                    continue

        if isinstance(content, (list, tuple)) and len(content) > 0:
            for item in content:
                try:
                    return self._extract_media_path(item)
                except ValueError:
                    continue

        raise ValueError(f"Unsupported media content: {content}")

    def _extract_text_content(self, content):
        """Recursively collect text from Gradio message content.

        Strings are returned as-is; dicts yield their "text" field; lists and
        tuples are joined with newlines (empty parts dropped).

        Raises:
            ValueError: if no text can be extracted.
        """
        if isinstance(content, str):
            return content

        if isinstance(content, dict):
            if content.get("type") == "text" and isinstance(content.get("text"), str):
                return content["text"]
            text = content.get("text")
            if isinstance(text, str):
                return text

        if isinstance(content, (list, tuple)) and len(content) > 0:
            text_parts = []
            for item in content:
                try:
                    text_parts.append(self._extract_text_content(item))
                except ValueError:
                    continue
            if text_parts:
                return "\n".join(part for part in text_parts if part)

        raise ValueError(f"Unsupported text content: {content}")

    def _normalize_user_content(self, content, fps, max_frames):
        """Convert Gradio chat content into the model client's part list.

        Returns a list of ``{"type": "text"|"image"|"video", ...}`` dicts.
        ``fps``/``max_frames`` are only attached to video parts.

        Raises:
            ValueError: for unrecognized content or media extensions.
        """
        if isinstance(content, str):
            return [{"type": "text", "text": content}]

        if isinstance(content, (list, tuple)):
            normalized_items = []
            for item in content:
                normalized_items.extend(self._normalize_user_content(item, fps, max_frames))
            return normalized_items

        if isinstance(content, dict):
            # Prefer the text interpretation; fall back to media on failure.
            try:
                text = self._extract_text_content(content)
            except ValueError:
                text = None
            else:
                return [{"type": "text", "text": text}]

            media_path = self._extract_media_path(content)
            media_ext = osp.splitext(media_path)[1].lower().lstrip(".")
            if media_ext in self.video_formats:
                return [{"type": "video", "video": {"video_path": media_path, "fps": fps, "max_frames": max_frames}}]
            if media_ext in self.image_formats:
                return [{"type": "image", "image": {"image_path": media_path}}]
            raise ValueError(f"Unsupported media type: {media_path}")

        raise ValueError(f"Unsupported user content: {content}")

    def _predict(self, messages, input_text, system_prompt, do_sample, temperature, top_p, max_new_tokens,
                 fps, max_frames):
        """Send the accumulated conversation to the model and stream the reply.

        Generator handler: yields ``(messages, textbox_value)`` pairs so the
        chatbot updates incrementally while tokens stream in; the textbox is
        cleared once a reply starts arriving.
        """
        messages = list(messages or [])
        input_text = input_text or ""
        if input_text:
            messages.append({"role": "user", "content": input_text})

        new_messages = []
        active_system_prompt = (system_prompt or self.default_system_prompt).strip()
        if active_system_prompt:
            new_messages.append({
                "role": "system",
                "content": [{"type": "text", "text": active_system_prompt}],
            })

        # Merge consecutive user messages (text + uploaded media) into a single
        # multi-part user turn, preserving assistant turns in between.
        contents = []
        for message in messages:
            if message["role"] == "assistant":
                if contents:
                    new_messages.append({"role": "user", "content": contents})
                    contents = []
                new_messages.append(message)
            elif message["role"] == "user":
                contents.extend(self._normalize_user_content(message["content"], fps, max_frames))

        if contents:
            new_messages.append({"role": "user", "content": contents})

        if not new_messages or new_messages[-1]["role"] != "user":
            # Nothing to send. Yield the unchanged state instead of returning a
            # bare value: inside a generator a plain `return messages` never
            # reaches Gradio and does not match the two declared outputs.
            yield messages, input_text
            return

        generation_config = {
            "do_sample": do_sample,
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_tokens,
        }

        response = self.model_client.submit({"conversation": new_messages, "generation_config": generation_config})
        if isinstance(response, str):
            # Non-streaming client: emit the full reply in one update.
            messages.append({"role": "assistant", "content": response})
            yield messages, ""
            return

        # Streaming client: grow the last assistant message token by token.
        messages.append({"role": "assistant", "content": ""})
        for token in response:
            messages[-1]['content'] += token
            yield messages, ""

    def launch(self):
        """Start the Gradio server with the kwargs captured at construction."""
        self.interface.launch(**self.server_kwargs)