Spaces:
Running on Zero
Sync Space app to tencent/Penguin-VL
- .gitattributes +9 -0
- README.md +297 -12
- app.py +23 -0
- assets/inputs/2b_table_result.png +3 -0
- assets/inputs/chart_understanding.png +3 -0
- assets/inputs/desert.jpg +3 -0
- assets/inputs/horse_poet.png +3 -0
- assets/inputs/leetcode.png +3 -0
- assets/inputs/newspaper.png +3 -0
- assets/inputs/polar_bear.mp4 +3 -0
- assets/inputs/sora.png +3 -0
- assets/inputs/video-example.mp4 +3 -0
- inference/interface/__init__.py +1 -0
- inference/interface/gradio_interface.py +235 -0
- inference/launch_gradio_demo.py +62 -0
- inference/server/__init__.py +2 -0
- inference/server/direct_client.py +176 -0
- inference/server/plain_server.py +279 -0
- packages.txt +1 -0
- pre-requirements.txt +5 -0
- requirements.txt +37 -0
.gitattributes
CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/inputs/2b_table_result.png filter=lfs diff=lfs merge=lfs -text
+assets/inputs/chart_understanding.png filter=lfs diff=lfs merge=lfs -text
+assets/inputs/desert.jpg filter=lfs diff=lfs merge=lfs -text
+assets/inputs/horse_poet.png filter=lfs diff=lfs merge=lfs -text
+assets/inputs/leetcode.png filter=lfs diff=lfs merge=lfs -text
+assets/inputs/newspaper.png filter=lfs diff=lfs merge=lfs -text
+assets/inputs/polar_bear.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/inputs/sora.png filter=lfs diff=lfs merge=lfs -text
+assets/inputs/video-example.mp4 filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,15 +1,300 @@
(The previous 15-line YAML metadata block was removed; its field values were not captured in this diff view. The new README content follows.)
<p align="center">
  <img src="assets/logo.png" width="150" style="margin-bottom: 0.2;"/>
</p>

<h3 align="center">Penguin-VL: Exploring the Efficiency Limits of VLM with LLM-based Vision Encoders</h3>

<h5 align="center">

[Penguin-VL-2B](https://huggingface.co/tencent/Penguin-VL-2B)
[Penguin-VL-8B](https://huggingface.co/tencent/Penguin-VL-8B)
[Penguin-Encoder](https://huggingface.co/tencent/Penguin-Encoder) <br>
[Demo](https://huggingface.co/spaces/lkeab/Penguin-VL-8B)
[Paper](https://huggingface.co/papers/xxx.xxxx)
[arXiv](https://arxiv.org/abs/xxx.xxxx)
</h5>

---

## 📰 News

* **[2025.03]** Release inference code, vLLM plugin, and Gradio demo for Penguin-VL.
* **[2025.03]** Release [Penguin-VL-2B](https://huggingface.co/tencent/Penguin-VL-2B), [Penguin-VL-8B](https://huggingface.co/tencent/Penguin-VL-8B), and [Penguin Vision Encoder](https://huggingface.co/tencent/Penguin-Encoder) on Hugging Face.

---

## ✨ Overview

**Penguin-VL** is a compact vision-language model family built to study how far multimodal efficiency can be pushed by redesigning the **vision encoder**, rather than only scaling data or model size.

Most modern VLMs rely on vision encoders pretrained with large-scale **contrastive objectives** such as CLIP or SigLIP. Penguin-VL argues that this setup can be suboptimal for multimodal reasoning because contrastive learning favors coarse category-level invariances over the fine-grained signals needed for **OCR, document understanding, dense captioning, and complex reasoning**. Instead, Penguin-VL introduces **Penguin-Encoder**, a vision encoder **initialized from a text-only LLM**, so the visual backbone starts closer to the language model representation space and learns more data-efficiently.

<p align="center">
  <img src="assets/framework.png" alt="Penguin-VL framework overview" width="920"/>
</p>
<p align="center">
  <em>Framework overview of Penguin-VL: an LLM-initialized vision encoder, mixed-supervision pretraining, and efficient video token compression.</em>
</p>
### Highlights

- **LLM → Vision Encoder initialization (Penguin-Encoder)**
  Initialize the vision encoder from a text-only LLM (e.g., Qwen3-0.6B), convert causal attention to **bidirectional attention**, and add **2D-RoPE** for variable-resolution vision tokens.

- **Mixed-supervision encoder pretraining**
  Warm up the LLM-initialized encoder with a reconstruction/distillation objective (amplitude / direction / relation losses) to inject visual knowledge stably, then switch to high-resolution alignment.

- **Video efficiency via Temporal Redundancy-Aware (TRA) token compression**
  Dynamically allocate token budgets across **key frames vs. intermediate frames** under a global token budget to scale to long videos more efficiently (see the sketch after this list).

- **Unified training recipe**
  A low-to-high resolution curriculum + instruction tuning strategy that balances image and video capabilities at a compact scale.
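The repository does not expose TRA as a standalone function; purely as intuition for the budget-splitting idea above, here is a toy Python sketch. Every name and heuristic in it (`frame_diffs`, `key_stride`, the per-frame token counts) is a hypothetical illustration, not the released algorithm.

```python
# Toy sketch of a TRA-style budget allocator. All names and heuristics here
# (frame_diffs, key_stride, key_tokens) are hypothetical illustrations of the
# idea, not the shipped implementation.
def allocate_frame_budgets(frame_diffs, total_budget, key_stride=8,
                           key_tokens=256, min_tokens=16):
    """Give key frames a full token budget and split the remainder among
    intermediate frames in proportion to how much they change."""
    n = len(frame_diffs)
    key_idx = set(range(0, n, key_stride))  # key frames: every key_stride-th frame
    budgets = [key_tokens if i in key_idx else 0 for i in range(n)]
    remaining = max(0, total_budget - sum(budgets))
    inter_idx = [i for i in range(n) if i not in key_idx]
    weight_sum = sum(frame_diffs[i] for i in inter_idx) or 1.0
    for i in inter_idx:  # highly redundant frames receive fewer tokens
        budgets[i] = max(min_tokens, int(remaining * frame_diffs[i] / weight_sum))
    return budgets
```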
---

## 📈 Results

Penguin-VL-2B delivers a strong accuracy-efficiency tradeoff across image and video benchmarks, with especially solid gains on OCR-heavy and reasoning-heavy tasks where fine-grained visual understanding matters most.

<p align="center">
  <img src="assets/2b_results.png" alt="Penguin-VL-2B benchmark results" width="980"/>
</p>
<p align="center">
  <em>Benchmark snapshot for Penguin-VL-2B across image and video evaluation suites.</em>
</p>

The released checkpoints and encoder weights are listed below.

---

## 📦 Model Zoo

| Model | Hugging Face |
| :---- | :----------- |
| **Penguin-VL-2B** | [tencent/Penguin-VL-2B](https://huggingface.co/tencent/Penguin-VL-2B) |
| **Penguin-VL-8B** | [tencent/Penguin-VL-8B](https://huggingface.co/tencent/Penguin-VL-8B) |
| **Penguin Vision Encoder** | [tencent/Penguin-Encoder](https://huggingface.co/tencent/Penguin-Encoder) |

---

## 🛠️ Environment Setup

### Requirements

- **Python** = 3.11.13 (recommended)
- **PyTorch** ≥ 2.5 (CUDA 12.4 recommended)
- **CUDA** ≥ 11.8

### Installation

```bash
# Clone the repository
git clone <repo_url>
cd <repo_name>

# Recommended: create and activate a clean conda environment
conda create -n PenguinVL python=3.11.13 -y
conda activate PenguinVL

# Install ffmpeg if you don't have it on your system
conda install ffmpeg -y  # optional

# Install dependencies (inference + Gradio demo)
pip install -r requirements.txt

# Install Flash Attention (recommended for faster inference)
pip install flash-attn==2.8.3 --no-build-isolation
```

### Version Notes

| Use Case | Recommended |
| :------- | :---------- |
| **Transformers inference** | `transformers==4.51.3` |
| **vLLM inference** | Install vLLM separately (see [§ vLLM Inference](#-vllm-inference)) |

---

## 🤖 Inference (Transformers)

Use Hugging Face `AutoModelForCausalLM` + `AutoProcessor` for image, video, and text inputs.

```bash
python inference/example_penguinvl.py
```

You can pass a custom `--model-path` argument to the script (default: `tencent/Penguin-VL-8B`). Supported input formats (a minimal loading-and-generation sketch follows this list):

- **Video:** `type: "video"` with `video_path`, `fps`, `max_frames`
- **Image:** `type: "image"` with `image_path`
- **Mixed:** image + video + text in one conversation
- **Text-only:** plain text dialogue
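For reference, the sketch below mirrors how the demo server in this commit drives the model (`AutoModelForCausalLM` with `trust_remote_code=True` and the processor's `conversation` interface); treat the exact field values as illustrative and prefer `inference/example_penguinvl.py` as the canonical entry point.

```python
# Minimal sketch distilled from inference/server code in this commit.
import torch
from transformers import AutoModelForCausalLM, AutoProcessor

model_path = "tencent/Penguin-VL-8B"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # use "sdpa" if flash-attn is absent
    device_map={"": "cuda:0"},
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

conversation = [{
    "role": "user",
    "content": [
        {"type": "image", "image": {"image_path": "assets/inputs/horse_poet.png"}},
        {"type": "text", "text": "Describe this image."},
    ],
}]
inputs = processor(
    conversation=conversation,
    add_system_prompt=True,
    add_generation_prompt=True,
    return_tensors="pt",
)
inputs = {k: v.to("cuda:0") if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

output_ids = model.generate(**inputs, max_new_tokens=512)
# Strip the prompt tokens before decoding.
answer = processor.tokenizer.decode(
    output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(answer)
```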
---

## 📓 Cookbook

Check out the inference notebook for a GitHub-friendly walkthrough of Penguin-VL across diverse tasks.
Unlike a multi-notebook cookbook, Penguin-VL currently provides **one consolidated notebook** that covers multiple representative examples in a single place.

| Notebook | Description |
| :------- | :---------- |
| [Inference Recipes](inference/notebooks/01_penguinvl_inference_recipes.public.ipynb) | Demonstrations of Penguin-VL for **visual code generation**, **OCR/document parsing**, **creative image understanding**, **table extraction**, **multi-round chart analysis**, **multi-round video understanding**, **mixed video+image prompting**, and a **text-only baseline**. |

If you want to re-execute the notebook locally and regenerate the GitHub-previewable output:

```bash
export PENGUIN_VL_MODEL_PATH=tencent/Penguin-VL-8B

jupyter nbconvert \
    --to notebook \
    --execute \
    --output 01_penguinvl_inference_recipes.public.ipynb \
    --ExecutePreprocessor.timeout=-1 \
    --ExecutePreprocessor.kernel_name=penguinvl \
    inference/notebooks/01_penguinvl_inference_recipes.source.ipynb
```

The clean source notebook lives at [inference/notebooks/01_penguinvl_inference_recipes.source.ipynb](inference/notebooks/01_penguinvl_inference_recipes.source.ipynb).

---

## ⚡ vLLM Inference

> Installing **vLLM 0.11.0** requires **PyTorch 2.8** and a compatible version of **Flash Attention**. This setup may differ from the default Transformers inference environment (which recommends PyTorch ≥ 2.5). You may need to create a separate environment or upgrade dependencies accordingly to avoid version conflicts.

### Environment

- The vLLM plugin targets **vLLM 0.11.0** (`penguinvl/plugin/vllm/v0_11_0/`).
- vLLM is not in `requirements.txt` by default; install it separately:

```bash
pip install vllm==0.11.0
```

**Troubleshooting:** If you see `cannot find -lcuda` during the flashinfer build:

```bash
export LIBRARY_PATH=/usr/lib/x86_64-linux-gnu:$LIBRARY_PATH
# or /usr/local/cuda/lib64 depending on your CUDA install
```

### Start vLLM Server

```bash
# Single GPU
python -m penguinvl.plugin.vllm serve tencent/Penguin-VL-8B

# Multi-GPU (e.g. 8B on 2 GPUs)
python -m penguinvl.plugin.vllm serve tencent/Penguin-VL-8B --port 8000 --tensor-parallel-size 2
```

Additional options: `--host`, `--max-model-len`, etc. (see the vLLM 0.11 `serve` docs). Once the server is up, you can query it as sketched below.
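As a quick smoke test, the sketch below assumes the plugin's `serve` command keeps vLLM's standard OpenAI-compatible HTTP API on the chosen port (vLLM's default behavior); the endpoint path and payload shape follow vLLM's own docs, not Penguin-VL-specific code.

```python
# Hedged sketch: assumes `penguinvl.plugin.vllm serve` exposes vLLM's standard
# OpenAI-compatible endpoints (the default for `vllm serve`).
import requests

resp = requests.post(
    "http://localhost:8000/v1/chat/completions",
    json={
        "model": "tencent/Penguin-VL-8B",
        "messages": [{"role": "user", "content": "Write a haiku about penguins."}],
        "max_tokens": 128,
    },
    timeout=120,
)
print(resp.json()["choices"][0]["message"]["content"])
```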
### vLLM Demo Script

Run text, image, video, and batch demos:

```bash
# All demos (single GPU)
CUDA_VISIBLE_DEVICES=0 python inference/test_vllm_infer.py --model-path tencent/Penguin-VL-8B

# Text-only
CUDA_VISIBLE_DEVICES=0 python inference/test_vllm_infer.py --model-path tencent/Penguin-VL-8B --demo text

# Image (requires --image-path)
CUDA_VISIBLE_DEVICES=0 python inference/test_vllm_infer.py --model-path tencent/Penguin-VL-8B --demo image --image-path assets/inputs/horse_poet.png

# Video
CUDA_VISIBLE_DEVICES=0 python inference/test_vllm_infer.py --model-path tencent/Penguin-VL-8B --demo video --video-path assets/inputs/polar_bear.mp4

# 8B with tensor parallelism (2 GPUs)
CUDA_VISIBLE_DEVICES=0,1 python inference/test_vllm_infer.py --model-path tencent/Penguin-VL-8B --tensor-parallel-size 2
```

| Argument | Description |
| :------- | :---------- |
| `--model-path` | HuggingFace model name or local path |
| `--demo` | `text` \| `image` \| `batch` \| `video` \| `all` |
| `--tensor-parallel-size` | Number of GPUs for tensor parallelism |
| `--max-new-tokens` | Max tokens to generate |
| `--max-model-len` | Max context length |
| `--gpu-memory-utilization` | GPU memory fraction (0–1) |

---

## 🤗 Gradio Demo (Local UI)

Launch a local web UI with image/video upload and chat.

### Quick Start

```bash
python inference/launch_gradio_demo.py --model-path tencent/Penguin-VL-8B
```

Then open **http://localhost:33666** (or your machine's IP + port) in a browser.

### Options

| Option | Description | Default |
| :----- | :---------- | :------ |
| `--model-path` | Model path or HuggingFace ID | *required* |
| `--server-port` | Backend inference server port | 16667 |
| `--interface-port` | Gradio web UI port | 33666 |
| `--nproc` | Number of backend worker processes | 1 |

### Examples

```bash
# 2B model, default ports
python inference/launch_gradio_demo.py --model-path tencent/Penguin-VL-2B

# 8B model, custom UI port
python inference/launch_gradio_demo.py --model-path tencent/Penguin-VL-8B --interface-port 8080

# Multi-worker backend
python inference/launch_gradio_demo.py --model-path tencent/Penguin-VL-8B --nproc 4
```

---

## 📁 Project Structure

```text
.
├── penguinvl/                     # Core model and processor code
│   ├── plugin/vllm/               # vLLM plugin (v0_11_0)
│   └── ...
├── inference/
│   ├── example_penguinvl.py       # Transformers inference example
│   ├── test_vllm_infer.py         # vLLM inference demo
│   ├── launch_gradio_demo.py      # Gradio local demo
│   ├── notebooks/                 # Executed and source Jupyter notebooks
│   ├── server/                    # Backend for Gradio
│   ├── interface/                 # Gradio UI
│   └── transformers_api/          # Transformers model/processor wrappers
├── assets/
│   ├── framework.png              # README framework figure
│   ├── 2b_results.png             # README benchmark figure
│   └── inputs/                    # Demo images and videos
└── requirements.txt
```

---

## 📄 License

This project is released under the [Apache 2.0 License](LICENSE).

## 📚 Citation

If you use Penguin-VL in your research, please cite:

```bibtex
...
```

---

If you find this project useful, please consider giving it a ⭐ on GitHub. Issues and PRs are welcome.
app.py
ADDED
@@ -0,0 +1,23 @@
import os

from inference.interface import PenguinVLQwen3GradioInterface
from inference.server import PenguinVLQwen3DirectClient
from inference.server.direct_client import ensure_flash_attn_installed


def main():
    ensure_flash_attn_installed()
    model_client = PenguinVLQwen3DirectClient(
        model_path=os.getenv("MODEL_PATH", "tencent/Penguin-VL-8B"),
    )
    interface = PenguinVLQwen3GradioInterface(
        model_client,
        example_dir=os.getenv("EXAMPLE_DIR", "./assets/inputs"),
        server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
        server_port=int(os.getenv("PORT", "7860")),
    )
    interface.launch()


if __name__ == "__main__":
    main()
assets/inputs/2b_table_result.png
ADDED (Git LFS)

assets/inputs/chart_understanding.png
ADDED (Git LFS)

assets/inputs/desert.jpg
ADDED (Git LFS)

assets/inputs/horse_poet.png
ADDED (Git LFS)

assets/inputs/leetcode.png
ADDED (Git LFS)

assets/inputs/newspaper.png
ADDED (Git LFS)

assets/inputs/polar_bear.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef863db1456a4d6c20e8a8a346925d7bee2c4f4e2e6e7749f80f6b4961a2062c
size 2865660

assets/inputs/sora.png
ADDED (Git LFS)

assets/inputs/video-example.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ae512cb2e0311fb0be4de6f9f6a646598e7ccb8da397ca5800ec2e9b8115bfd1
size 11846338
inference/interface/__init__.py
ADDED
@@ -0,0 +1 @@
from .gradio_interface import PenguinVLQwen3GradioInterface
inference/interface/gradio_interface.py
ADDED
@@ -0,0 +1,235 @@
import os
import os.path as osp

import gradio as gr

HEADER = """
# Penguin-VL Gradio Interface

Developed by the [Penguin-VL](https://github.com/tencent-ailab/Penguin-VL) team at Tencent AI Lab.

Note: speed on ZeroGPU does not reflect real model speed and may be influenced by the shared environment. For stable and fast inference, please deploy the demo locally following [the local UI instructions](https://github.com/tencent-ailab/Penguin-VL?tab=readme-ov-file#-gradio-demo-local-ui). For usage examples and expected results, please refer to [the inference notebook](https://github.com/tencent-ailab/Penguin-VL/blob/master/inference/notebooks/01_penguinvl_inference_recipes.public.ipynb).

Please log in with your Hugging Face account first. We provide some example images and videos for easier trials.
"""


class PenguinVLQwen3GradioInterface(object):

    def __init__(self, model_client, example_dir=None, default_system_prompt="You are a helpful assistant developed by Tencent AI Lab PenguinVL team.", **server_kwargs):
        self.model_client = model_client
        self.server_kwargs = server_kwargs
        self.default_system_prompt = (default_system_prompt or "").strip()

        self.image_formats = ("png", "jpg", "jpeg")
        self.video_formats = ("mp4", "mov")
        image_examples, video_examples = [], []
        if example_dir is not None:
            example_files = [
                osp.join(example_dir, f) for f in os.listdir(example_dir)
            ]
            for example_file in example_files:
                if example_file.endswith(self.image_formats):
                    image_examples.append([example_file])
                elif example_file.endswith(self.video_formats):
                    video_examples.append([example_file])

        with gr.Blocks() as self.interface:
            gr.Markdown(HEADER)
            with gr.Row():
                chatbot_kwargs = {"elem_id": "chatbot", "height": 710}
                try:
                    chatbot = gr.Chatbot(type="messages", **chatbot_kwargs)
                except TypeError:
                    # Gradio 6 uses OpenAI-style messages by default and removed the `type` arg.
                    chatbot = gr.Chatbot(**chatbot_kwargs)

                with gr.Column():
                    with gr.Tab(label="Input"):

                        with gr.Row():
                            input_video = gr.Video(sources=["upload"], label="Upload Video")
                            input_image = gr.Image(sources=["upload"], type="filepath", label="Upload Image")

                        if len(image_examples):
                            gr.Examples(image_examples, inputs=[input_image], label="Example Images")
                        if len(video_examples):
                            gr.Examples(video_examples, inputs=[input_video], label="Example Videos")

                        input_text = gr.Textbox(label="Input Text", placeholder="Type your message here and press enter to submit")

                        submit_button = gr.Button("Generate")

                    with gr.Tab(label="Configure"):
                        with gr.Accordion("Prompt Config", open=True):
                            system_prompt = gr.Textbox(
                                value=self.default_system_prompt,
                                label="System Prompt",
                                lines=4,
                                placeholder="Optional: system instruction prepended to each request",
                            )

                        with gr.Accordion("Generation Config", open=True):
                            do_sample = gr.Checkbox(value=True, label="Do Sample")
                            temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.1, label="Temperature")
                            top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.9, label="Top P")
                            max_new_tokens = gr.Slider(minimum=0, maximum=4096, value=1024, step=1, label="Max New Tokens")

                        with gr.Accordion("Video Config", open=True):
                            fps = gr.Slider(minimum=0.0, maximum=10.0, value=1, label="FPS")
                            max_frames = gr.Slider(minimum=0, maximum=256, value=180, step=1, label="Max Frames")

            input_video.change(self._on_video_upload, [chatbot, input_video], [chatbot, input_video])
            input_image.change(self._on_image_upload, [chatbot, input_image], [chatbot, input_image])
            input_text.submit(self._on_text_submit, [chatbot, input_text], [chatbot, input_text])
            submit_button.click(
                self._predict,
                [
                    chatbot, input_text, system_prompt, do_sample, temperature, top_p, max_new_tokens,
                    fps, max_frames,
                ],
                [chatbot, input_text],
            )

    def _on_video_upload(self, messages, video):
        messages = messages or []
        if video is not None:
            # messages.append({"role": "user", "content": gr.Video(video)})
            messages.append({"role": "user", "content": {"path": video}})
        return messages, None

    def _on_image_upload(self, messages, image):
        messages = messages or []
        if image is not None:
            # messages.append({"role": "user", "content": gr.Image(image)})
            messages.append({"role": "user", "content": {"path": image}})
        return messages, None

    def _on_text_submit(self, messages, text):
        messages = messages or []
        messages.append({"role": "user", "content": text})
        return messages, ""

    def _extract_media_path(self, content):
        if isinstance(content, dict):
            if content.get("type") == "text" and isinstance(content.get("text"), str):
                raise ValueError(f"Text content is not media: {content}")
            media_path = content.get("path")
            if media_path:
                return media_path
            for value in content.values():
                try:
                    return self._extract_media_path(value)
                except ValueError:
                    continue

        if isinstance(content, (list, tuple)) and len(content) > 0:
            for item in content:
                try:
                    return self._extract_media_path(item)
                except ValueError:
                    continue

        raise ValueError(f"Unsupported media content: {content}")

    def _extract_text_content(self, content):
        if isinstance(content, str):
            return content

        if isinstance(content, dict):
            if content.get("type") == "text" and isinstance(content.get("text"), str):
                return content["text"]
            text = content.get("text")
            if isinstance(text, str):
                return text

        if isinstance(content, (list, tuple)) and len(content) > 0:
            text_parts = []
            for item in content:
                try:
                    text_parts.append(self._extract_text_content(item))
                except ValueError:
                    continue
            if text_parts:
                return "\n".join(part for part in text_parts if part)

        raise ValueError(f"Unsupported text content: {content}")

    def _normalize_user_content(self, content, fps, max_frames):
        if isinstance(content, str):
            return [{"type": "text", "text": content}]

        if isinstance(content, (list, tuple)):
            normalized_items = []
            for item in content:
                normalized_items.extend(self._normalize_user_content(item, fps, max_frames))
            return normalized_items

        if isinstance(content, dict):
            try:
                text = self._extract_text_content(content)
            except ValueError:
                text = None
            else:
                return [{"type": "text", "text": text}]

            media_path = self._extract_media_path(content)
            media_ext = osp.splitext(media_path)[1].lower().lstrip(".")
            if media_ext in self.video_formats:
                return [{"type": "video", "video": {"video_path": media_path, "fps": fps, "max_frames": max_frames}}]
            if media_ext in self.image_formats:
                return [{"type": "image", "image": {"image_path": media_path}}]
            raise ValueError(f"Unsupported media type: {media_path}")

        raise ValueError(f"Unsupported user content: {content}")

    def _predict(self, messages, input_text, system_prompt, do_sample, temperature, top_p, max_new_tokens,
                 fps, max_frames):
        messages = list(messages or [])
        input_text = input_text or ""
        if input_text:
            messages.append({"role": "user", "content": input_text})
        new_messages = []
        active_system_prompt = (system_prompt or self.default_system_prompt).strip()
        if active_system_prompt:
            new_messages.append({
                "role": "system",
                "content": [{"type": "text", "text": active_system_prompt}],
            })

        # Merge consecutive user turns (text plus uploaded media) into a single
        # multimodal user message before each assistant turn.
        contents = []
        for message in messages:
            if message["role"] == "assistant":
                if len(contents):
                    new_messages.append({"role": "user", "content": contents})
                    contents = []
                new_messages.append(message)
            elif message["role"] == "user":
                contents.extend(self._normalize_user_content(message["content"], fps, max_frames))

        if len(contents):
            new_messages.append({"role": "user", "content": contents})

        if len(new_messages) == 0 or new_messages[-1]["role"] != "user":
            # Nothing to generate; echo the current chat state back to the UI.
            yield messages, input_text
            return

        generation_config = {
            "do_sample": do_sample,
            "temperature": temperature,
            "top_p": top_p,
            "max_new_tokens": max_new_tokens,
        }

        response = self.model_client.submit({"conversation": new_messages, "generation_config": generation_config})
        if isinstance(response, str):
            messages.append({"role": "assistant", "content": response})
            yield messages, ""
            return

        messages.append({"role": "assistant", "content": ""})
        for token in response:
            messages[-1]["content"] += token
            yield messages, ""

    def launch(self):
        self.interface.launch(**self.server_kwargs)
inference/launch_gradio_demo.py
ADDED
@@ -0,0 +1,62 @@
import sys
sys.path.append('.')

import argparse
import os
import subprocess
from threading import Thread

from inference.interface import PenguinVLQwen3GradioInterface
from inference.server import PenguinVLQwen3PlainClient


def launch_gradio_demo(model_path, server_port=16667, interface_port=33666, server_name="0.0.0.0", nproc=1, example_dir="./assets/inputs"):
    # Run the backend inference server as a subprocess so the Gradio UI and
    # the model workers live in separate processes.
    server_thread = Thread(
        target=lambda: subprocess.run(
            [
                sys.executable, "-m",
                "inference.server.plain_server",
                "--model-path", model_path,
                "--nproc", str(nproc),
                "--port", str(server_port),
            ]
        )
    )
    server_thread.daemon = True
    server_thread.start()

    if example_dir is not None and not os.path.isdir(example_dir):
        example_dir = None

    model_client = PenguinVLQwen3PlainClient(port=server_port)
    interface = PenguinVLQwen3GradioInterface(
        model_client,
        example_dir=example_dir,
        server_name=server_name,
        server_port=interface_port,
    )
    interface.launch()


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", "--model_path", type=str, required=True)
    parser.add_argument("--server-port", "--server_port", type=int, default=16667)
    parser.add_argument("--interface-port", "--interface_port", type=int, default=33666)
    parser.add_argument("--server-name", "--server_name", type=str, default="0.0.0.0")
    parser.add_argument("--nproc", type=int, default=1)
    parser.add_argument("--example-dir", "--example_dir", type=str, default="./assets/inputs")
    args = parser.parse_args()

    launch_gradio_demo(
        model_path=args.model_path,
        server_port=args.server_port,
        interface_port=args.interface_port,
        server_name=args.server_name,
        nproc=args.nproc,
        example_dir=args.example_dir,
    )


if __name__ == "__main__":
    main()
inference/server/__init__.py
ADDED
@@ -0,0 +1,2 @@
from .plain_server import PenguinVLQwen3PlainClient, PenguinVLQwen3PlainServer
from .direct_client import PenguinVLQwen3DirectClient
inference/server/direct_client.py
ADDED
@@ -0,0 +1,176 @@
import importlib
import importlib.util
import os
import subprocess
import sys
from threading import Lock, Thread

import torch
from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer

try:
    import spaces
except ImportError:
    # Outside Hugging Face Spaces, replace the `spaces` decorator with a no-op shim.
    class _SpacesShim:
        @staticmethod
        def GPU(*args, **kwargs):
            if args and callable(args[0]) and len(args) == 1 and not kwargs:
                return args[0]

            def decorator(fn):
                return fn

            return decorator

    spaces = _SpacesShim()


_MODEL = None
_PROCESSOR = None
_MODEL_PATH = None
_MODEL_LOCK = Lock()
_FLASH_ATTN_LOCK = Lock()
_FLASH_ATTN_PACKAGE = "flash_attn"
_FLASH_ATTN_REQUIREMENT = os.getenv("FLASH_ATTN_REQUIREMENT", "flash-attn==2.8.3")


def _get_attn_implementation():
    return os.getenv("ATTN_IMPLEMENTATION", "sdpa")


def _get_model_revision():
    return os.getenv("MODEL_REVISION")


def ensure_flash_attn_installed():
    if importlib.util.find_spec(_FLASH_ATTN_PACKAGE) is not None:
        return

    with _FLASH_ATTN_LOCK:
        if importlib.util.find_spec(_FLASH_ATTN_PACKAGE) is not None:
            return

        install_cmd = [
            sys.executable,
            "-m",
            "pip",
            "install",
            _FLASH_ATTN_REQUIREMENT,
            "--no-build-isolation",
        ]
        print(f"Installing {_FLASH_ATTN_REQUIREMENT} with --no-build-isolation...")
        subprocess.check_call(install_cmd, env=os.environ.copy())
        importlib.invalidate_caches()
        if importlib.util.find_spec(_FLASH_ATTN_PACKAGE) is None:
            raise RuntimeError(f"Failed to import {_FLASH_ATTN_PACKAGE} after installation.")


def _ensure_model_loaded(model_path):
    global _MODEL, _PROCESSOR, _MODEL_PATH

    if _MODEL is not None and _PROCESSOR is not None and _MODEL_PATH == model_path:
        return _MODEL, _PROCESSOR

    with _MODEL_LOCK:
        if _MODEL is not None and _PROCESSOR is not None and _MODEL_PATH == model_path:
            return _MODEL, _PROCESSOR

        ensure_flash_attn_installed()
        attn_implementation = _get_attn_implementation()
        revision = _get_model_revision()

        processor_kwargs = {
            "trust_remote_code": True,
        }
        if revision:
            processor_kwargs["revision"] = revision

        model_kwargs = {
            "trust_remote_code": True,
            "device_map": {"": "cuda:0"},
            "torch_dtype": torch.bfloat16,
            "attn_implementation": attn_implementation,
        }
        if revision:
            model_kwargs["revision"] = revision

        _MODEL = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
        _PROCESSOR = AutoProcessor.from_pretrained(model_path, **processor_kwargs)
        _MODEL_PATH = model_path
        return _MODEL, _PROCESSOR


def _estimate_duration(payload):
    # Estimate a ZeroGPU time budget: video requests and longer generations
    # get more time, capped at 180 seconds.
    generation_config = payload.get("generation_config", {})
    max_new_tokens = int(generation_config.get("max_new_tokens", 512))
    has_video = False
    for message in payload.get("conversation", []):
        for content in message.get("content", []):
            if isinstance(content, dict) and content.get("type") == "video":
                has_video = True
                break
        if has_video:
            break

    base_duration = 90 if has_video else 60
    token_budget = max_new_tokens // 16
    return min(180, max(base_duration, base_duration + token_budget))


@spaces.GPU(duration=_estimate_duration)
def _run_generation_stream(payload):
    model_path = payload["model_path"]
    model, processor = _ensure_model_loaded(model_path)

    inputs = processor(
        conversation=payload["conversation"],
        add_system_prompt=True,
        add_generation_prompt=True,
        return_tensors="pt",
    )
    inputs = {k: v.to("cuda:0") if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
    if "pixel_values" in inputs:
        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

    generation_kwargs = {
        **inputs,
        **payload.get("generation_config", {}),
    }
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    generation_kwargs["streamer"] = streamer

    generation_error = {}

    def _generation_worker():
        try:
            with torch.inference_mode():
                model.generate(**generation_kwargs)
        except Exception as exc:
            # Unblock the streamer so the consumer loop terminates; re-raised below.
            generation_error["exc"] = exc
            streamer.on_finalized_text("", stream_end=True)

    thread = Thread(target=_generation_worker, daemon=True)
    thread.start()

    for token in streamer:
        yield token

    if "exc" in generation_error:
        raise generation_error["exc"]


class PenguinVLQwen3DirectClient(object):

    def __init__(self, model_path):
        self.model_path = model_path

    def submit(self, payload):
        return _run_generation_stream({
            "model_path": self.model_path,
            "conversation": payload["conversation"],
            "generation_config": payload.get("generation_config", {}),
        })
inference/server/plain_server.py
ADDED
@@ -0,0 +1,279 @@
import argparse
import random
import socket
import time
import traceback

import json
import logging
import torch
from transformers import AutoModelForCausalLM, AutoProcessor, TextIteratorStreamer
from threading import Thread
from multiprocessing import Process, Queue

EOS_FLAG = "<EOS>"
SEPARATOR = "<SEP>"


def get_logger(name):
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)
    handler = logging.StreamHandler()
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    return logger


class Streamer(object):
    """Iterator over tokens pushed into a multiprocessing queue; iteration
    stops when the EOS flag arrives (or on timeout)."""

    def __init__(self, timeout=None):
        self.timeout = timeout
        self.queue = Queue(maxsize=1024)
        self.stop_signal = EOS_FLAG

    def put(self, value):
        self.queue.put(value)

    def __iter__(self):
        return self

    def __next__(self):
        try:
            value = self.queue.get(timeout=self.timeout)
        except Exception:
            raise StopIteration()

        if value == self.stop_signal:
            raise StopIteration()
        else:
            return value


class PenguinVLQwen3PlainClient(object):

    def __init__(self, host="localhost", port=16666):
        self.host = host
        self.port = port

        self.input_buffer = Queue(maxsize=1024)
        self.streamers = dict()
        self.logger = get_logger("penguinvl_qwen3.client")

        client_thread = Thread(target=self._client_worker)
        client_thread.daemon = True
        client_thread.start()

    def _receive_worker(self, server_socket):
        try:
            while True:
                data = server_socket.recv(8192)
                if not data:
                    self.logger.info("Connection has been terminated.")
                    for streamer in self.streamers.values():
                        streamer.put(streamer.stop_signal)
                    break

                # Messages are SEPARATOR-framed JSON objects: {"id": ..., "data": ...}.
                for sub_data in data.decode("utf-8").split(SEPARATOR):
                    if len(sub_data) == 0:
                        continue

                    try:
                        sub_data = json.loads(sub_data)
                    except Exception:
                        self.logger.info(f"Failed to parse data: {sub_data}")
                        continue

                    self.logger.info(f"Received: {sub_data['data']}")
                    self.streamers[sub_data["id"]].put(sub_data["data"])

                    if sub_data["data"] == EOS_FLAG:
                        self.streamers.pop(sub_data["id"])

        except ConnectionResetError:
            self.logger.info("Connection has been terminated.")

    def _send_worker(self, server_socket):
        while True:
            request_id, conversation = self.input_buffer.get()
            data = json.dumps({"id": request_id, "data": conversation}) + SEPARATOR
            server_socket.sendall(data.encode("utf-8"))
            self.logger.info(f"Sent: {data}")

    def _client_worker(self):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
            while True:
                try:
                    server_socket.connect((self.host, self.port))
                    break
                except ConnectionRefusedError:
                    self.logger.info("Waiting for the server to start...")
                    time.sleep(1)
                    continue

            self.logger.info("Connected to server.")
            receive_thread = Thread(target=self._receive_worker, args=(server_socket,))
            receive_thread.daemon = True
            receive_thread.start()

            send_thread = Thread(target=self._send_worker, args=(server_socket,))
            send_thread.daemon = True
            send_thread.start()

            receive_thread.join()

    def submit(self, conversation):
        request_id = random.randint(0, 4294967295)
        streamer = Streamer()
        self.streamers[request_id] = streamer
        self.input_buffer.put((request_id, conversation))
        return streamer


class PenguinVLQwen3PlainServer(object):

    def __init__(
        self,
        model_path,
        torch_dtype=torch.bfloat16,
        attn_implementation="flash_attention_2",
        num_processes=1,
        buffer_size=2,
        host="localhost",
        port=16666,
    ):
        self.model_path = model_path
        self.torch_dtype = torch_dtype
        self.attn_implementation = attn_implementation
        self.num_processes = num_processes
        self.buffer_size = buffer_size

        self.host = host
        self.port = port

    def _model_worker(self, input_buffer, output_buffer, device_map, rank):
        logger = get_logger(f"penguinvl_qwen3.server.worker_{rank}")
        logger.info(f"Loading model from {self.model_path}...")

        model = AutoModelForCausalLM.from_pretrained(
            self.model_path,
            trust_remote_code=True,
            torch_dtype=self.torch_dtype,
            attn_implementation=self.attn_implementation,
            device_map=device_map,
        )
        processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True)
        logger.info("Successfully loaded model.")

        while True:
            logger.info("Waiting for input...")
            request_id, data = input_buffer.get()
            try:
                inputs = processor(
                    conversation=data["conversation"],
                    add_system_prompt=True,
                    add_generation_prompt=True,
                    return_tensors="pt"
                )
                inputs = {k: v.to(f"cuda:{rank}") if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
                if "pixel_values" in inputs:
                    inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)

                streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
                generation_kwargs = {
                    **inputs,
                    **data["generation_config"],
                    "streamer": streamer,
                }

                thread = Thread(target=model.generate, kwargs=generation_kwargs)
                thread.daemon = True
                thread.start()

                # Relay tokens tagged with the request id, then signal EOS.
                for token in streamer:
                    output_buffer.put((request_id, token))
                output_buffer.put((request_id, EOS_FLAG))

            except Exception:
                logger.error(f"An error occurred: {traceback.format_exc()}")
                output_buffer.put((request_id, "Server error! Please check the server logs and retry."))
                output_buffer.put((request_id, EOS_FLAG))

    def _receive_worker(self, logger, input_buffer, client_socket, client_address):
        try:
            while True:
                data = client_socket.recv(8192)
                if not data:
                    logger.info(f"Connection from {client_address} has been terminated.")
                    break

                for sub_data in data.decode("utf-8").split(SEPARATOR):
                    if len(sub_data) == 0:
                        continue

                    try:
                        sub_data = json.loads(sub_data)
                    except Exception:
                        logger.info(f"Failed to parse data: {sub_data}")
                        continue

                    logger.info(f"Received from {client_address}: {sub_data}")
                    input_buffer.put((sub_data["id"], sub_data["data"]))

        except ConnectionResetError:
            logger.info(f"Connection from {client_address} has been terminated.")

    def _send_worker(self, logger, output_buffer, client_socket, client_address):
        try:
            while True:
                request_id, token = output_buffer.get()
                data = json.dumps({"id": request_id, "data": token}) + SEPARATOR
                client_socket.sendall(data.encode("utf-8"))

        except ConnectionResetError:
            logger.info(f"Connection from {client_address} has been terminated.")

    def launch(self):
        logger = get_logger("penguinvl_qwen3.server.controller")

        input_buffer = Queue(maxsize=self.num_processes * self.buffer_size)
        output_buffer = Queue(maxsize=self.num_processes * 1024)

        # One model worker per GPU; worker i pins its weights to cuda:i.
        for i in range(self.num_processes):
            device_map = {"": f"cuda:{i}"}
            process = Process(target=self._model_worker, args=(input_buffer, output_buffer, device_map, i))
            process.start()

        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as server_socket:
            server_socket.bind((self.host, self.port))
            server_socket.listen(1)
            logger.info("Waiting for connection...")

            while True:
                client_socket, client_address = server_socket.accept()
                logger.info(f"Connected to {client_address}.")

                receive_thread = Thread(target=self._receive_worker, args=(logger, input_buffer, client_socket, client_address))
                receive_thread.daemon = True
                receive_thread.start()

                send_thread = Thread(target=self._send_worker, args=(logger, output_buffer, client_socket, client_address))
                send_thread.daemon = True
                send_thread.start()


if __name__ == "__main__":
    torch.multiprocessing.set_start_method("spawn")

    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", "--model_path", type=str, required=True)
    parser.add_argument("--nproc", type=int, default=8)
    parser.add_argument("--port", type=int, default=16666)
    args = parser.parse_args()

    server = PenguinVLQwen3PlainServer(
        model_path=args.model_path,
        num_processes=args.nproc,
        port=args.port,
    )
    server.launch()
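For orientation, here is a minimal usage sketch of the TCP client above, assuming a `plain_server` is running on the matching port; the payload shape mirrors what the Gradio interface submits.

```python
# Minimal usage sketch for PenguinVLQwen3PlainClient; assumes a running
# plain_server on the same port (here, the Gradio launcher's default 16667).
from inference.server import PenguinVLQwen3PlainClient

client = PenguinVLQwen3PlainClient(port=16667)
streamer = client.submit({
    "conversation": [
        {"role": "user", "content": [{"type": "text", "text": "Hello!"}]},
    ],
    "generation_config": {"do_sample": True, "temperature": 0.1,
                          "top_p": 0.9, "max_new_tokens": 256},
})
for token in streamer:  # tokens stream back over the socket until EOS
    print(token, end="", flush=True)
```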
packages.txt
ADDED
@@ -0,0 +1 @@
build-essential
pre-requirements.txt
ADDED
@@ -0,0 +1,5 @@
# Build helpers recommended by the FlashAttention installation guide.
packaging
psutil
ninja
wheel
requirements.txt
ADDED
@@ -0,0 +1,37 @@
--extra-index-url https://download.pytorch.org/whl/cu124

# Base runtime for Transformers inference and the Gradio demo.
# Training, notebook, and vLLM-specific extras were removed from this file.
# The previous full list is preserved in requirements.original.txt.

# Core model runtime
torch==2.5.1
torchvision==0.20.1
transformers==4.51.3
tokenizers==0.21.4
accelerate==1.10.1
huggingface_hub==0.34.4
sentencepiece==0.1.99
timm==1.0.3
numpy==1.24.4
Pillow
einops==0.6.1
einops-exts==0.0.4

# Image and video processing
decord==0.6.0
imageio==2.34.0
imageio-ffmpeg==0.4.9
opencv-python-headless==4.6.0.66
ffmpeg-python
requests

# UI
gradio>=5.44.1,<7

# FlashAttention is installed separately with:
#   pip install flash-attn==2.8.3 --no-build-isolation
# This cannot be expressed in a standard Gradio Space requirements install step.

# Optional extras
# vllm==0.11.0