Upload folder using huggingface_hub
Browse files- .gitattributes +10 -0
- CASES.md +0 -0
- README.md +184 -3
- README_zh.md +177 -0
- image/0.jpg +0 -0
- image/15.jpg +0 -0
- image/15_3983.jpg +0 -0
- image/224.jpg +0 -0
- image/224_8978.jpg +0 -0
- image/4356.jpg +3 -0
- image/CHEM-059.png +3 -0
- image/CHEM-059_1939.png +3 -0
- image/CHEM-081.jpg +0 -0
- image/CHEM-083.jpg +0 -0
- image/HRbench_4k_231.jpg +3 -0
- image/HRbench_4k_231_01c150b9.jpg +3 -0
- image/HRbench_4k_231_5e355368.jpg +3 -0
- image/P2504_0017.png +3 -0
- image/P2504_0017_6934c19b.png +3 -0
- image/mechanics01.png +0 -0
- image/mechanics02.png +0 -0
- image/meterials01.png +0 -0
- image/meterials02.png +0 -0
- image/quantum_1-1023.png +0 -0
- image/quantum_1-1023_5890.png +0 -0
- image/s1-vl-32b-benchmark.png +3 -0
- image/s1-vl-32b-twi.png +3 -0
- image/val_1691.png +0 -0
- image/val_1691_4856.png +0 -0
- twi_server.py +237 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
image/4356.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
image/CHEM-059.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
image/CHEM-059_1939.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
image/HRbench_4k_231.jpg filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
image/HRbench_4k_231_01c150b9.jpg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
image/HRbench_4k_231_5e355368.jpg filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
image/P2504_0017.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
image/P2504_0017_6934c19b.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
image/s1-vl-32b-benchmark.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
image/s1-vl-32b-twi.png filter=lfs diff=lfs merge=lfs -text
|
CASES.md
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
README.md
CHANGED
|
@@ -1,3 +1,184 @@
|
|
| 1 |
-
--
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# S1-VL-32B: Scientific Multimodal Reasoning Model
|
| 2 |
+
|
| 3 |
+
[中文版](./README_zh.md) | [English](./README.md)
|
| 4 |
+
|
| 5 |
+
## 🔬 Introduction
|
| 6 |
+
|
| 7 |
+
**S1-VL-32B** is a multimodal large language model for scientific domains, developed by the ScienceOne team at the Chinese Academy of Sciences. It natively supports two reasoning paradigms — **Multimodal Reasoning** and **Thinking with Images** — and achieves state-of-the-art performance across multiple mainstream scientific multimodal evaluation benchmarks.
|
| 8 |
+
|
| 9 |
+
- **Multimodal Reasoning Mode**: Chain-of-thought-based multimodal scientific reasoning, designed for the analysis and solving of complex, multi-step problems.
|
| 10 |
+
- **Thinking with Images Mode**: Enables the model to actively invoke code tools during the reasoning process to perform image operations — including cropping, zooming, image enhancement, bounding box annotation, and keypoint marking — before generating responses.
|
| 11 |
+
|
| 12 |
+
We have established a **cross-disciplinary data processing pipeline** that conducts multi-dimensional utility evaluation and filtering of visual reasoning trajectories to ensure the quality of training data. A **multi-stage post-training procedure** is employed to progressively unlock the scientific reasoning capabilities of S1-VL-32B:
|
| 13 |
+
|
| 14 |
+
- **Stage 1**: Large-scale multimodal instruction data spanning multiple disciplines — including **mathematics, physics, chemistry, astronomy, earth sciences, and biology** — is used for mixed training to enhance the model's scientific visual understanding and logical reasoning abilities, laying a solid foundation for academic figure Q&A, medical image analysis, chemical structure recognition, and related tasks.
|
| 15 |
+
- **Stage 2**: The **Thinking with Images** reasoning paradigm is introduced. Through high-quality **scientific reasoning data annealing**, the model acquires the ability to perform **image operations via code** during inference. This approach yields particularly outstanding performance in scenarios requiring fine-grained image analysis, with notable strengths in interpreting dense scientific charts, high-resolution remote sensing imagery, microscopic images, and complex visual scenes such as astronomical observation data.
|
| 16 |
+
|
| 17 |
+
## 📂 Model Weights
|
| 18 |
+
|
| 19 |
+
| Model | Parameters | HuggingFace | ModelScope |
|
| 20 |
+
|-------|-----------|-------------|------------|
|
| 21 |
+
| S1-VL-32B | 32B | 🤗 [Download](https://huggingface.co/ScienceOne-AI/S1-VL-32B) | 🤖 [Download](https://modelscope.cn/models/ScienceOne-AI/S1-VL-32B) |
|
| 22 |
+
|
| 23 |
+
## 🏆 Evaluation Results
|
| 24 |
+
|
| 25 |
+
The evaluation covers **2 dimensions** and **13 benchmarks**. The **Scientific Multimodal Reasoning** dimension includes MMMU, SFE, MathVision, Physics, ScienceOlympiad, VRSBench-MINI, GMAI-MMBench, and Galaxy-10-DECaLS, spanning mathematics, physics, medicine, remote sensing, astronomy, and other professional fields. The **Image Manipulation Reasoning** dimension includes HRBench-4K, HRBench-8K, MME-RealWorld-CN, MME-RealWorld-Lite, and V*, focusing on high-resolution image understanding and real-world visual reasoning.
|
| 26 |
+
|
| 27 |
+
<div align="center">
|
| 28 |
+
<img src="./image/s1-vl-32b-benchmark.png"/>
|
| 29 |
+
</div>
|
| 30 |
+
|
| 31 |
+
S1-VL-32B demonstrates outstanding overall competitiveness across the aforementioned evaluations. In **scientific multimodal reasoning** tasks, the model achieves significant advantages on multiple authoritative benchmarks — including MMMU, MathVision, and VRSBench-MINI — surpassing its base model Qwen3-VL-32B in overall performance, while remaining highly competitive against open-source models with substantially larger parameter scales (e.g., Qwen3-VL-235B, Intern-S1) as well as closed-source flagship models (e.g., Gemini 2.5 Pro, GPT-5). In **image operation reasoning** tasks, S1-VL-32B ranks **first across all five benchmark evaluations**, comprehensively outperforming models of comparable and larger scales, while also surpassing dedicated "Thinking with Images" models such as Thyme-VL and Skywork-R1V4. These results fully validate its ability to achieve efficient, high-quality multimodal reasoning at the 32B parameter scale.
|
| 32 |
+
|
| 33 |
+
## 🧠 Case Study
|
| 34 |
+
|
| 35 |
+
The following presents reasoning examples of S1-VL-32B operating in **Thinking with Images** mode. When processing a low-resolution cervical CT image, S1-VL-32B proactively invokes code tools during its reasoning process to perform **cropping and magnification** on the region of interest. By obtaining a clearer local image, the model then combines the enhanced visual information with its internal knowledge to complete the reasoning.
|
| 36 |
+
|
| 37 |
+
<div align="center">
|
| 38 |
+
<img src="./image/s1-vl-32b-twi.png"/>
|
| 39 |
+
</div>
|
| 40 |
+
|
| 41 |
+
📁 More cases are available in [CASES.md](./CASES.md).
|
| 42 |
+
|
| 43 |
+
## 🚀 Quick Start
|
| 44 |
+
|
| 45 |
+
### 1. Install Dependencies
|
| 46 |
+
|
| 47 |
+
```bash
|
| 48 |
+
# Requires vLLM >= 0.11.0
|
| 49 |
+
pip install -U vllm
|
| 50 |
+
pip install qwen-vl-utils==0.0.14
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
### 2. Start the vLLM Service
|
| 54 |
+
|
| 55 |
+
```bash
|
| 56 |
+
vllm serve ScienceOne-AI/S1-VL-32B \
|
| 57 |
+
--tensor-parallel-size 4 \
|
| 58 |
+
--max-model-len 32768 \
|
| 59 |
+
--limit-mm-per-prompt image=15 \
|
| 60 |
+
--reasoning-parser deepseek_r1 \
|
| 61 |
+
--enable-prefix-caching \
|
| 62 |
+
--gpu-memory-utilization 0.95 \
|
| 63 |
+
--port 9200
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
### 3. Multimodal Reasoning Mode
|
| 67 |
+
|
| 68 |
+
```python
|
| 69 |
+
from openai import OpenAI
|
| 70 |
+
import base64
|
| 71 |
+
|
| 72 |
+
client = OpenAI(api_key="EMPTY", base_url="http://localhost:9200/v1")
|
| 73 |
+
|
| 74 |
+
with open("path/to/your/image.png", "rb") as f:
|
| 75 |
+
image_data = base64.b64encode(f.read()).decode("utf-8")
|
| 76 |
+
|
| 77 |
+
response = client.chat.completions.create(
|
| 78 |
+
model="ScienceOne-AI/S1-VL-32B",
|
| 79 |
+
messages=[
|
| 80 |
+
{
|
| 81 |
+
"role": "user",
|
| 82 |
+
"content": [
|
| 83 |
+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}},
|
| 84 |
+
{"type": "text", "text": "Please describe the physical phenomenon shown in the image and derive the relevant equations."},
|
| 85 |
+
],
|
| 86 |
+
}
|
| 87 |
+
],
|
| 88 |
+
temperature=0.6,
|
| 89 |
+
top_p=0.95,
|
| 90 |
+
max_tokens=16384,
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
# The reasoning process is in the reasoning_content field
|
| 94 |
+
print("Thinking process:\n", response.choices[0].message.reasoning_content)
|
| 95 |
+
print("\nFinal answer:\n", response.choices[0].message.content)
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
### 4. Thinking with Images Mode
|
| 99 |
+
|
| 100 |
+
Thinking with Images mode requires deploying a **code sandbox** to support the model invoking code tools during reasoning for image operations (cropping, zooming, enhancement, annotation, etc.).
|
| 101 |
+
|
| 102 |
+
#### Step 1: Deploy the Code Sandbox
|
| 103 |
+
|
| 104 |
+
We recommend deploying the AIO Sandbox with Docker:
|
| 105 |
+
|
| 106 |
+
```bash
|
| 107 |
+
git clone https://github.com/agent-infra/sandbox
|
| 108 |
+
cd sandbox
|
| 109 |
+
# Mount the host image directory into the container
|
| 110 |
+
docker run -d \
|
| 111 |
+
--name twi-sandbox \
|
| 112 |
+
-p 18081:18081 \
|
| 113 |
+
-v /data/images:/mnt/data/images \ # host path → sandbox path
|
| 114 |
+
sandbox:latest
|
| 115 |
+
```
|
| 116 |
+
The mount path must match the path configuration in the FastAPI service.
|
| 117 |
+
|
| 118 |
+
#### Step 2: Start the Thinking with Images FastAPI Service
|
| 119 |
+
|
| 120 |
+
Download [twi_server.py](twi_server.py) and update the path configuration at the top of the file:
|
| 121 |
+
|
| 122 |
+
```python
|
| 123 |
+
CHAT_API = "http://localhost:9200/v1/chat/completions" # vLLM address
|
| 124 |
+
JUPYTER_API = "http://localhost:18081/v1/jupyter" # Sandbox address
|
| 125 |
+
HOST_IMG_DIR = "/data/images" # ← Host image directory (must match docker -v mount)
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
Start the service:
|
| 129 |
+
|
| 130 |
+
```bash
|
| 131 |
+
pip install fastapi uvicorn httpx pillow
|
| 132 |
+
python twi_server.py # Listens on port 10044
|
| 133 |
+
```
|
| 134 |
+
|
| 135 |
+
#### Step 3: Call the Thinking with Images Endpoint
|
| 136 |
+
|
| 137 |
+
```python
|
| 138 |
+
import httpx
|
| 139 |
+
import base64
|
| 140 |
+
|
| 141 |
+
with open("path/to/your/image.png", "rb") as f:
|
| 142 |
+
image_b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 143 |
+
|
| 144 |
+
messages = [
|
| 145 |
+
{"type": "text", "text": "Please carefully analyze this scientific image."},
|
| 146 |
+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
|
| 147 |
+
]
|
| 148 |
+
|
| 149 |
+
response = httpx.post(
|
| 150 |
+
"http://localhost:10044/process",
|
| 151 |
+
json={
|
| 152 |
+
"messages": messages,
|
| 153 |
+
"image_path_list": ["/data/images/your_image.png"], # Absolute host path
|
| 154 |
+
},
|
| 155 |
+
timeout=300,
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
result = response.json()
|
| 159 |
+
|
| 160 |
+
# The final answer is the last message with role="assistant"
|
| 161 |
+
final = [m for m in result["messages"] if m["role"] == "assistant"][-1]
|
| 162 |
+
print(final["content"])
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
## 📄 Citation
|
| 166 |
+
|
| 167 |
+
If you use S1-VL-32B in your research, please cite (the corresponding paper is coming soon):
|
| 168 |
+
|
| 169 |
+
```latex
|
| 170 |
+
@misc{s1vl2026,
|
| 171 |
+
title = {S1-VL-32B: Scientific Multimodal Reasoning Model},
|
| 172 |
+
author = {ScienceOne Team},
|
| 173 |
+
year = {2026},
|
| 174 |
+
howpublished = {\url{https://huggingface.co/ScienceOne-AI/S1-VL-32B}}
|
| 175 |
+
}
|
| 176 |
+
```
|
| 177 |
+
|
| 178 |
+
## 📜 License
|
| 179 |
+
|
| 180 |
+
This project is released under the Apache 2.0 License.
|
| 181 |
+
|
| 182 |
+
## 🙏 Acknowledgements
|
| 183 |
+
|
| 184 |
+
We thank the open-source communities and pioneering works of [Qwen3-VL](https://modelscope.cn/collections/Qwen3-VL-5c7a94c8cb144b) and [AIO Sandbox](https://github.com/agent-infra/sandbox) for laying the foundation for the scientific multimodal reasoning research behind S1-VL-32B.
|
README_zh.md
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# S1-VL-32B:科学多模态推理模型
|
| 2 |
+
|
| 3 |
+
[中文版](./README_zh.md) | [English](./README.md)
|
| 4 |
+
|
| 5 |
+
## 🔬 模型简介
|
| 6 |
+
**S1-VL-32B** 是由中国科学院 “磐石 · 科学基础大模型” ScienceOne 团队研发的面向科学领域的多模态大语言模型,原生支持 **Multimodal Reasoning(多模态推理)** 与 **Thinking with Images(图像思考)** 两种推理范式,在多项主流科学多模态评测基准上达到当前最优水平。
|
| 7 |
+
- **Multimodal Reasoning 模式**:基于思维链的多模态科学推理,适用于复杂多步问题的分析与求解。
|
| 8 |
+
- **Thinking with Images 模式**:允许模型在推理的思考过程中主动调用代码工具进行图像操作(包括裁剪、放缩、图像增强、画框标注、描点标记等)再生成回答。
|
| 9 |
+
|
| 10 |
+
我们建立**跨学科体系的数据处理管道**对视觉推理轨迹进行多维度效用评估与筛选,确保训练推理轨迹的质量;并采用多阶段后训练流程逐步解锁S1-VL-32B 模型的科学推理能力:
|
| 11 |
+
- 首先,基于涵盖**数理化天地生**等多学科的大规模多模态指令数据进行混合训练,提升模型科学视觉理解和逻辑推理能力,使模型在学术图像问答、医学影像分析、化学结构识别等方面奠定坚实基础;
|
| 12 |
+
- 然后,引入 **Thinking with Images** 推理范式,通过高质量**科学推理数据退火**,使模型具备在推理过程中通过代码进行**图像操作**的能力,在需要精细化图像分析的场景中表现尤为突出,尤其擅长解读密集科学图表、高分辨率遥感图像、显微图像及天文观测数据等复杂视觉场景。
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
## 📂 模型权重
|
| 16 |
+
|
| 17 |
+
| 模型名称 | 参数量 | HuggingFace | ModelScope |
|
| 18 |
+
|--------|------|-------------|------------|
|
| 19 |
+
| S1-VL-32B | 32B | 🤗 [下载](https://huggingface.co/ScienceOne-AI/S1-VL-32B) | 🤖 [下载](https://modelscope.cn/models/ScienceOne-AI/S1-VL-32B) |
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
## 🏆 评测结果
|
| 23 |
+
|
| 24 |
+
本次评测共涵盖 **2大维度**、**13个基准**。**科学多模态推理**方向涵盖 MMMU、SFE、MathVision、Physics、ScienceOlympiad、VRSBench-MINI、GMAI-MMBench 和 Galaxy-10-DECaLS,覆盖数学、物理、医学、遥感、天文等多个专业领域;**图像操作推理**方向则包含 HRBench-4K、HRBench-8K、MME-RealWorld-CN、MME-RealWorld-Lite 和 V*,重点考察模型在高分辨率图像理解与真实场景视觉推理中的表现。
|
| 25 |
+
|
| 26 |
+
<div align="center">
|
| 27 |
+
<img src="./image/s1-vl-32b-benchmark.png"/>
|
| 28 |
+
</div>
|
| 29 |
+
|
| 30 |
+
S1-VL-32B 在上述评测中展现出突出的综合竞争力。在**科学多模态推理**任务中,模型在 MMMU、MathVision、VRSBench-MINI 等多个权威基准上优势显著,整体性能超越基座模型 Qwen3-VL-32B,并与更大参数规模的开源模型(如 Qwen3-VL-235B、Intern-S1)以及闭源旗舰模型(如 Gemini 2.5 Pro、GPT-5)保持相当的竞争力;在**图像操作推理**任务中,S1-VL-32B 在全部五项基准评测中均位列第一,全面领先于同规模及更大规模模型,同时优于 Thyme-VL、Skywork-R1V4 等 “Thinking with Images” 专有模型,充分验证了其在 32B 参数规模下实现高效且高质量多模态推理的能力。
|
| 31 |
+
|
| 32 |
+
## 🧠 案例展示
|
| 33 |
+
|
| 34 |
+
以下展示 S1-VL-32B 在 **Thinking with Images** 模式下的推理案例。S1-VL-32B在处理一张低分辨率的颈部CT图像的思考过程中主动调用代码工具,对目标区域进行**裁剪与放大**,获取更清晰的局部图像后,再结合模型内部知识完成推理。
|
| 35 |
+
|
| 36 |
+
<div align="center">
|
| 37 |
+
<img src="./image/s1-vl-32b-twi.png"/>
|
| 38 |
+
</div>
|
| 39 |
+
|
| 40 |
+
📁 更多案例详见 [CASES.md](./CASES.md) 文件。
|
| 41 |
+
|
| 42 |
+
## 🚀 快速开始
|
| 43 |
+
### 1. 安装依赖
|
| 44 |
+
|
| 45 |
+
```bash
|
| 46 |
+
# 需要 vLLM >= 0.11.0
|
| 47 |
+
pip install -U vllm
|
| 48 |
+
pip install qwen-vl-utils==0.0.14
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### 2. 启动 vLLM 服务
|
| 52 |
+
```bash
|
| 53 |
+
vllm serve ScienceOne-AI/S1-VL-32B \
|
| 54 |
+
--tensor-parallel-size 4 \
|
| 55 |
+
--max-model-len 32768 \
|
| 56 |
+
--limit-mm-per-prompt image=15 \
|
| 57 |
+
--reasoning-parser deepseek_r1 \
|
| 58 |
+
--enable-prefix-caching \
|
| 59 |
+
--gpu-memory-utilization 0.95 \
|
| 60 |
+
--port 9200
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
### 3. Multimodal Reasoning 模式
|
| 64 |
+
```python
|
| 65 |
+
from openai import OpenAI
|
| 66 |
+
import base64
|
| 67 |
+
|
| 68 |
+
client = OpenAI(api_key="EMPTY", base_url="http://localhost:9200/v1")
|
| 69 |
+
|
| 70 |
+
with open("path/to/your/image.png", "rb") as f:
|
| 71 |
+
image_data = base64.b64encode(f.read()).decode("utf-8")
|
| 72 |
+
|
| 73 |
+
response = client.chat.completions.create(
|
| 74 |
+
model="ScienceOne-AI/S1-VL-32B",
|
| 75 |
+
messages=[
|
| 76 |
+
{
|
| 77 |
+
"role": "user",
|
| 78 |
+
"content": [
|
| 79 |
+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_data}"}},
|
| 80 |
+
{"type": "text", "text": "请描述图中所示的物理现象并推导相关方程。"},
|
| 81 |
+
],
|
| 82 |
+
}
|
| 83 |
+
],
|
| 84 |
+
temperature=0.2,
|
| 85 |
+
max_tokens=16384,
|
| 86 |
+
)
|
| 87 |
+
|
| 88 |
+
# 推理过程在 reasoning_content 字段中
|
| 89 |
+
print("思考过程:\n", response.choices[0].message.reasoning_content)
|
| 90 |
+
print("\n最终答案:\n", response.choices[0].message.content)
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
### 4. Thinking with Images 模式
|
| 94 |
+
|
| 95 |
+
Thinking with Images 模式需要部署**代码沙箱**,以支持模型在推理过程中调用代码工具实现图像操作(裁剪、放缩、增强、标注等)。
|
| 96 |
+
|
| 97 |
+
#### Step 1:部署代码沙箱
|
| 98 |
+
|
| 99 |
+
推荐使用 Docker 部署 AIO Sandbox:
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
git clone https://github.com/agent-infra/sandbox
|
| 103 |
+
cd sandbox
|
| 104 |
+
# 将宿主机图片目录挂载到容器内
|
| 105 |
+
docker run -d \
|
| 106 |
+
--name twi-sandbox \
|
| 107 |
+
-p 18081:18081 \
|
| 108 |
+
-v /data/images:/mnt/data/images \ # 宿主机路径 → 沙箱内路径
|
| 109 |
+
sandbox:latest
|
| 110 |
+
```
|
| 111 |
+
挂载路径需与 FastAPI 服务中的路径配置保持一致。
|
| 112 |
+
|
| 113 |
+
#### Step 2:启动 Thinking with Images FastAPI 服务
|
| 114 |
+
|
| 115 |
+
下载 [twi_server.py](twi_server.py),修改文件顶部的路径配置:
|
| 116 |
+
|
| 117 |
+
```python
|
| 118 |
+
CHAT_API = "http://localhost:9200/v1/chat/completions" # vLLM 地址
|
| 119 |
+
JUPYTER_API = "http://localhost:18081/v1/jupyter" # 沙箱地址
|
| 120 |
+
HOST_IMG_DIR = "/data/images" # ← 宿主机图片目录(需与 docker -v 挂载一致)
|
| 121 |
+
```
|
| 122 |
+
|
| 123 |
+
启动服务:
|
| 124 |
+
|
| 125 |
+
```bash
|
| 126 |
+
pip install fastapi uvicorn httpx pillow
|
| 127 |
+
python twi_server.py # 监听 10044 端口
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
#### Step 3:调用 Thinking with Images 接口
|
| 131 |
+
|
| 132 |
+
```python
|
| 133 |
+
import httpx
|
| 134 |
+
import base64
|
| 135 |
+
|
| 136 |
+
with open("path/to/your/image.png", "rb") as f:
|
| 137 |
+
image_b64 = base64.b64encode(f.read()).decode("utf-8")
|
| 138 |
+
|
| 139 |
+
messages = [
|
| 140 |
+
{"type": "text", "text": "请仔细分析这张科学图像。"},
|
| 141 |
+
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{image_b64}"}},
|
| 142 |
+
]
|
| 143 |
+
|
| 144 |
+
response = httpx.post(
|
| 145 |
+
"http://localhost:10044/process",
|
| 146 |
+
json={
|
| 147 |
+
"messages": messages,
|
| 148 |
+
"image_path_list": ["/data/images/your_image.png"], # 宿主机绝对路径
|
| 149 |
+
},
|
| 150 |
+
timeout=300,
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
result = response.json()
|
| 154 |
+
|
| 155 |
+
# 最终答案为最后一条 role="assistant" 的消息
|
| 156 |
+
final = [m for m in result["messages"] if m["role"] == "assistant"][-1]
|
| 157 |
+
print(final["content"])
|
| 158 |
+
```
|
| 159 |
+
|
| 160 |
+
## 📄 引用
|
| 161 |
+
|
| 162 |
+
如果您在研究中使用了 S1-VL-32B,欢迎引用(相关论文即将发布,敬请期待):
|
| 163 |
+
```latex
|
| 164 |
+
@misc{s1vl2026,
|
| 165 |
+
title = {S1-VL-32B: Scientific Multimodal Reasoning Model},
|
| 166 |
+
author = {ScienceOne Team},
|
| 167 |
+
year = {2026},
|
| 168 |
+
howpublished = {\url{https://huggingface.co/ScienceOne-AI/S1-VL-32B}}
|
| 169 |
+
}
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
## 📜 开源协议
|
| 174 |
+
本项目基于 Apache 2.0 协议开源发布。
|
| 175 |
+
|
| 176 |
+
## 🙏 致谢
|
| 177 |
+
感谢 [Qwen3-VL](https://modelscope.cn/collections/Qwen3-VL-5c7a94c8cb144b)、[AIO Sandbox](https://github.com/agent-infra/sandbox) 等开源社区和先驱工作为 S1-VL-32B 科学多模态推理的研究工作奠定的基础。
|
image/0.jpg
ADDED
|
image/15.jpg
ADDED
|
image/15_3983.jpg
ADDED
|
image/224.jpg
ADDED
|
image/224_8978.jpg
ADDED
|
image/4356.jpg
ADDED
|
Git LFS Details
|
image/CHEM-059.png
ADDED
|
Git LFS Details
|
image/CHEM-059_1939.png
ADDED
|
Git LFS Details
|
image/CHEM-081.jpg
ADDED
|
image/CHEM-083.jpg
ADDED
|
image/HRbench_4k_231.jpg
ADDED
|
Git LFS Details
|
image/HRbench_4k_231_01c150b9.jpg
ADDED
|
Git LFS Details
|
image/HRbench_4k_231_5e355368.jpg
ADDED
|
Git LFS Details
|
image/P2504_0017.png
ADDED
|
Git LFS Details
|
image/P2504_0017_6934c19b.png
ADDED
|
Git LFS Details
|
image/mechanics01.png
ADDED
|
image/mechanics02.png
ADDED
|
image/meterials01.png
ADDED
|
image/meterials02.png
ADDED
|
image/quantum_1-1023.png
ADDED
|
image/quantum_1-1023_5890.png
ADDED
|
image/s1-vl-32b-benchmark.png
ADDED
|
Git LFS Details
|
image/s1-vl-32b-twi.png
ADDED
|
Git LFS Details
|
image/val_1691.png
ADDED
|
image/val_1691_4856.png
ADDED
|
twi_server.py
ADDED
|
@@ -0,0 +1,237 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import base64
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import httpx
|
| 6 |
+
from PIL import Image
|
| 7 |
+
from fastapi import FastAPI, HTTPException
|
| 8 |
+
from pydantic import BaseModel
|
| 9 |
+
from typing import List, Dict, Any
|
| 10 |
+
|
| 11 |
+
app = FastAPI(title="Thinking with Images API")

# ── Configuration ──
# Model id placed in each chat-completions request body.
MODEL_NAME = "model_name"
# vLLM OpenAI-compatible chat endpoint (see README: `vllm serve ... --port 9200`).
CHAT_API = "http://localhost:9200/v1/chat/completions"
# AIO sandbox Jupyter API (see README: docker container publishing port 18081).
JUPYTER_API = "http://localhost:18081/v1/jupyter"

# Sandbox internal paths <-> host machine real paths (docker volume mapping)
SANDBOX_IMG_DIR = "/mnt/data"
HOST_IMG_DIR = "/data"  # ← update to match actual mount path
SANDBOX_TMP_DIR = "/mnt/data/images/temp"  # fixed path — do not change
HOST_TMP_DIR = "/data/thinking_with_images/temp"

# System prompt advertising the single `python` tool. The model is expected to
# answer with <tool_call>{...}</tool_call> blocks, which process_request parses
# and forwards to the sandbox for execution.
SYSTEM_PROMPT = '''
You are a helpful assistant.

# Tools
You may call one or more functions to assist with the user query.
You are provided with function signatures within <tools></tools> XML tags:

<tools>
{"type": "function", "function": {"name": "python", "description": "Use this tool to execute Python code in your chain of thought.\n\nWhen you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 60.0 seconds. The drive at '/mnt/data/images/temp' can be used to save the temporary image files. Internet access for this session is disabled. Do not make external web requests or API calls as they will fail.\n\nReasoning & Image Manipulation & Drawing Auxiliary Graphics (Optional but Encouraged):\n- You have the capability to write executable Python code to perform image manipulations (e.g., cropping to a Region of Interest (ROI), resizing, rotation, adjusting contrast) or perform calculation for better reasoning.\n- You have the capability to write Python code to add auxiliary graphics (such as segments, circles, rectangles, labels, etc.) to the image, to help illustrate your reasoning process.\n- The code will be executed in a secure sandbox, and its output will be provided back to you for further analysis.\n- At the end of the code, print the path of the processed image (processed_path) or the relevant result for further processing within the sandbox environment.", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": "The Python code to execute"}}}, "required": ["code"]}}
</tools>

For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
'''

# Maximum number of model round-trips per request before the loop stops.
MAX_TURNS = 8
|
| 42 |
+
|
| 43 |
+
class RequestModel(BaseModel):
    """Request body accepted by POST /process."""
    # Flat list of multimodal content items ({"type": "text"|"image_url", ...}).
    messages: List[Dict[str, Any]]
    # Host-side image file paths, one per image_url item in `messages`, in order.
    image_path_list: List[str]
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
# ── Utility Functions ──
|
| 49 |
+
|
| 50 |
+
def get_img_size(path: str) -> tuple[int, int]:
    """Open the image at *path* and return its (width, height) in pixels."""
    img = Image.open(path)
    with img:
        dims = img.size
    return dims
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def encode_image(path: str) -> str:
    """Return the file at *path* encoded as a base64 ASCII string."""
    raw_bytes = Path(path).read_bytes()
    return base64.b64encode(raw_bytes).decode()
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def to_sandbox_path(host_path: str) -> str:
    """Convert a host machine path to the corresponding sandbox path.

    Only a *leading* HOST_IMG_DIR is rewritten. The previous
    str.replace-based version also rewrote accidental matches of
    HOST_IMG_DIR appearing deeper inside the path.
    """
    if host_path.startswith(HOST_IMG_DIR):
        return SANDBOX_IMG_DIR + host_path[len(HOST_IMG_DIR):]
    return host_path
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def to_host_path(sandbox_path: str) -> str:
    """Convert a sandbox temp-file path to the corresponding host machine path.

    Only a *leading* ``SANDBOX_TMP_DIR + "/"`` prefix is rewritten. The
    previous str.replace-based version also rewrote accidental matches
    appearing mid-path.
    """
    prefix = SANDBOX_TMP_DIR + "/"
    if sandbox_path.startswith(prefix):
        return HOST_TMP_DIR + "/" + sandbox_path[len(prefix):]
    return sandbox_path
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def build_user_content(
    messages: List[Dict[str, Any]],
    image_path_list: List[str],
) -> List[Dict[str, Any]]:
    """Inject image metadata (path, dimensions) after each image_url item.

    Args:
        messages: Flat list of multimodal content items
            ({"type": "text"|"image_url", ...}).
        image_path_list: Host-side image paths, one per image_url item,
            in order of appearance.

    Returns:
        A new content list in which every image_url item is immediately
        followed by a text item carrying the image's sandbox path and
        pixel dimensions (the model needs these to write cropping code).

    Raises:
        ValueError: If `messages` contains more image_url items than
            `image_path_list` provides paths for.
    """
    content: List[Dict[str, Any]] = []
    k = 0
    for item in messages:
        content.append(item)
        # .get() instead of ["type"]: items missing a "type" key are passed
        # through untouched rather than raising KeyError.
        if item.get("type") != "image_url":
            continue
        if k >= len(image_path_list):
            raise ValueError(
                f"image_path_list too short: need image #{k+1} but only {len(image_path_list)} provided"
            )
        w, h = get_img_size(image_path_list[k])
        sandbox_path = to_sandbox_path(image_path_list[k])
        content.append({
            "type": "text",
            "text": f"\nimage path: {sandbox_path}\nimage width: {w}\nimage height: {h}\n\n",
        })
        k += 1
    return content
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def build_initial_payload(user_content: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble the first chat-completions request body for the model."""
    conversation = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_content},
    ]
    # skip_special_tokens=False keeps the raw <tool_call> markers in the output.
    return {
        "model": MODEL_NAME,
        "messages": conversation,
        "skip_special_tokens": False,
    }
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def messages_to_text(payload_messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Flatten multimodal content in the final messages to plain text (for response/storage)."""
    flattened: List[Dict[str, Any]] = []
    for msg in payload_messages:
        # Only list-valued user content gets flattened; everything else
        # (system/assistant messages, plain-string user content) passes through.
        if msg["role"] != "user" or not isinstance(msg["content"], list):
            flattened.append(msg)
            continue
        pieces = []
        for item in msg["content"]:
            if item["type"] == "image_url":
                pieces.append("<image>")
            elif item["type"] == "text":
                pieces.append(item["text"])
        flattened.append({**msg, "content": "".join(pieces)})
    return flattened
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
# ── Core Logic ──
|
| 121 |
+
|
| 122 |
+
async def process_request(
    messages: List[Dict[str, Any]],
    image_path_list: List[str],
) -> Dict[str, Any]:
    """Run the multi-turn Thinking-with-Images loop for one request.

    Repeatedly calls the model; whenever the answer contains a
    <tool_call> block, the embedded Python code is executed in the
    sandbox Jupyter session and the resulting image is fed back to the
    model as a <tool_response> user turn. Stops at the first answer
    without a tool call, or after MAX_TURNS model calls.

    NOTE(review): `image_path_list` is mutated in place (host paths of
    tool-produced images are appended) and is also returned inside the
    payload — callers should not assume their list is untouched.

    Raises:
        HTTPException(500): on session-creation, model-call,
            tool-call-parsing, or code-execution failure.
    """

    user_content = build_user_content(messages, image_path_list)
    payload = build_initial_payload(user_content)

    async with httpx.AsyncClient(timeout=300.0) as client:

        # 1. Create Jupyter session (one stateful kernel per request so that
        #    variables persist across the model's successive tool calls)
        try:
            r = await client.post(
                f"{JUPYTER_API}/sessions/create",
                json={"kernel_name": "python3.10"},
            )
            r.raise_for_status()
            session_id = r.json()["data"]["session_id"]
        except Exception as e:
            raise HTTPException(500, f"Failed to create Jupyter session: {e}")

        try:
            for turn in range(1, MAX_TURNS + 1):

                # 2. Call the model
                try:
                    r = await client.post(
                        CHAT_API,
                        json=payload,
                        timeout=120.0,
                    )
                    r.raise_for_status()
                    resp = r.json()
                except Exception as e:
                    raise HTTPException(500, f"Model API request failed (turn={turn}): {e}")

                if "choices" not in resp:
                    raise HTTPException(500, f"Unexpected model response: {resp}")

                choice = resp["choices"][0]["message"]
                # "reasoning" may be absent or None depending on the parser config.
                thinking = (choice.get("reasoning") or "").strip()
                answer = choice["content"].strip()
                # Re-wrap the reasoning in <think> tags so the transcript keeps it.
                assistant_msg = f"<think>\n{thinking}\n</think>\n\n{answer}"

                # 3. No tool call — conversation complete
                if "<tool_call>" not in answer:
                    payload["messages"].append({"role": "assistant", "content": assistant_msg})
                    break

                # 4. Parse and execute the tool call
                # Only the first <tool_call>...</tool_call> block is honored.
                try:
                    raw = answer.split("<tool_call>")[1].split("</tool_call>")[0]
                    code = json.loads(raw)["arguments"]["code"]
                except Exception as e:
                    raise HTTPException(500, f"Failed to parse tool_call: {e}")

                try:
                    r = await client.post(
                        f"{JUPYTER_API}/execute",
                        json={"code": code, "timeout": 30,
                              "kernel_name": "python3.10", "session_id": session_id},
                        timeout=60.0,
                    )
                    r.raise_for_status()
                    exec_res = r.json()
                except Exception as e:
                    raise HTTPException(500, f"Code execution failed: {e}")

                # Skip this turn if execution failed
                # NOTE(review): the assistant message is NOT appended here, so the
                # next iteration re-sends the identical payload to the model.
                if not exec_res["success"]:
                    continue

                # The tool contract (see SYSTEM_PROMPT) is that the code prints the
                # processed image's sandbox path as its first output.
                sandbox_img_path = exec_res["data"]["outputs"][0]["text"].strip()
                host_img_path = to_host_path(sandbox_img_path)
                image_path_list.append(host_img_path)
                # assumes the produced file is JPEG — PNG output would get a
                # mismatched data-URL mime type; TODO confirm downstream tolerance.
                img_b64 = f"data:image/jpeg;base64,{encode_image(host_img_path)}"

                payload["messages"].append({"role": "assistant", "content": assistant_msg})
                payload["messages"].append({
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "<tool_response>\n"},
                        {"type": "image_url", "image_url": {"url": img_b64}},
                        {"type": "text", "text": f"\n{sandbox_img_path}\n</tool_response>"},
                    ],
                })

        finally:
            # 5. Clean up the Jupyter session (best effort — failure is only logged)
            try:
                await client.delete(f"{JUPYTER_API}/sessions/{session_id}")
            except Exception as e:
                print(f"[WARN] Failed to delete Jupyter session: {e}")

    # Flatten images to "<image>" placeholders and return the full transcript
    # together with the (possibly grown) image path list.
    payload["messages"] = messages_to_text(payload["messages"])
    payload["image_path_list"] = image_path_list
    return payload
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
# ── Routes ──
|
| 222 |
+
|
| 223 |
+
@app.post("/process")
async def process_images(request: RequestModel) -> Dict[str, Any]:
    """HTTP entry point: run the full Thinking-with-Images loop for one request."""
    return await process_request(request.messages, request.image_path_list)
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
@app.get("/health")
async def health_check():
    """Liveness probe: always reports the service as up."""
    return {"status": "ok"}
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# ── Entrypoint ──
|
| 234 |
+
|
| 235 |
+
if __name__ == "__main__":
    import uvicorn
    # Listen on all interfaces; port 10044 is the address the README's
    # client examples call (http://localhost:10044/process).
    uvicorn.run(app, host="0.0.0.0", port=10044)
|