manpk-ai commited on
Commit
d4d15db
·
verified ·
0 Parent(s):

Duplicate from manpk-ai/OpenCUA-32B-Inference

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. README.md +411 -0
  3. config.json +69 -0
  4. configuration_opencua.py +37 -0
  5. generation_config.json +4 -0
  6. handler.py +171 -0
  7. model-1-of-64.safetensors +3 -0
  8. model-10-of-64.safetensors +3 -0
  9. model-11-of-64.safetensors +3 -0
  10. model-12-of-64.safetensors +3 -0
  11. model-13-of-64.safetensors +3 -0
  12. model-14-of-64.safetensors +3 -0
  13. model-15-of-64.safetensors +3 -0
  14. model-16-of-64.safetensors +3 -0
  15. model-17-of-64.safetensors +3 -0
  16. model-18-of-64.safetensors +3 -0
  17. model-19-of-64.safetensors +3 -0
  18. model-2-of-64.safetensors +3 -0
  19. model-20-of-64.safetensors +3 -0
  20. model-21-of-64.safetensors +3 -0
  21. model-22-of-64.safetensors +3 -0
  22. model-23-of-64.safetensors +3 -0
  23. model-24-of-64.safetensors +3 -0
  24. model-25-of-64.safetensors +3 -0
  25. model-26-of-64.safetensors +3 -0
  26. model-27-of-64.safetensors +3 -0
  27. model-28-of-64.safetensors +3 -0
  28. model-29-of-64.safetensors +3 -0
  29. model-3-of-64.safetensors +3 -0
  30. model-30-of-64.safetensors +3 -0
  31. model-31-of-64.safetensors +3 -0
  32. model-32-of-64.safetensors +3 -0
  33. model-33-of-64.safetensors +3 -0
  34. model-34-of-64.safetensors +3 -0
  35. model-35-of-64.safetensors +3 -0
  36. model-36-of-64.safetensors +3 -0
  37. model-37-of-64.safetensors +3 -0
  38. model-38-of-64.safetensors +3 -0
  39. model-39-of-64.safetensors +3 -0
  40. model-4-of-64.safetensors +3 -0
  41. model-40-of-64.safetensors +3 -0
  42. model-41-of-64.safetensors +3 -0
  43. model-42-of-64.safetensors +3 -0
  44. model-43-of-64.safetensors +3 -0
  45. model-44-of-64.safetensors +3 -0
  46. model-45-of-64.safetensors +3 -0
  47. model-46-of-64.safetensors +3 -0
  48. model-47-of-64.safetensors +3 -0
  49. model-48-of-64.safetensors +3 -0
  50. model-49-of-64.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model:
3
+ - Qwen/Qwen2.5-VL-32B-Instruct
4
+ datasets:
5
+ - xlangai/AgentNet
6
+ - xlangai/aguvis-stage1
7
+ - xlangai/aguvis-stage2
8
+ - osunlp/UGround-V1-Data
9
+ language:
10
+ - en
11
+ license: mit
12
+ metrics:
13
+ - code_eval
14
+ - accuracy
15
+ pipeline_tag: image-text-to-text
16
+ tags:
17
+ - VLM
18
+ - Computer-Use-Agent
19
+ - OS-Agent
20
+ - GUI
21
+ - Grounding
22
+ library_name: transformers
23
+ ---
24
+
25
+ <h1 style="
26
+ font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;
27
+ font-size:48px;
28
+ font-weight:700;
29
+ line-height:1.25;
30
+ text-align:center;
31
+ margin:0 0 24px;">
32
+ OpenCUA: Open Foundations for Computer-Use Agents
33
+ </h1>
34
+
35
+ <div style="
36
+ display:flex;
37
+ justify-content:center;
38
+ gap:12px;
39
+ flex-wrap:wrap;
40
+ margin-bottom:28px;">
41
+
42
+ <a href="https://opencua.xlang.ai/" style="
43
+ display:inline-block;
44
+ padding:8px 24px;
45
+ background:#2b2b2b;
46
+ color:#ffffff;
47
+ border-radius:36px;
48
+ text-decoration:none;
49
+ font-weight:600;
50
+ font-size:16px;">
51
+ 🌐 Website
52
+ </a>
53
+
54
+ <a href="https://arxiv.org/abs/2508.09123" style="
55
+ display:inline-block;
56
+ padding:8px 24px;
57
+ background:#2b2b2b;
58
+ color:#ffffff;
59
+ border-radius:36px;
60
+ text-decoration:none;
61
+ font-weight:600;
62
+ font-size:16px;">
63
+ 📝 Paper
64
+ </a>
65
+
66
+ <a href="https://github.com/xlang-ai/OpenCUA" style="
67
+ display:inline-block;
68
+ padding:8px 24px;
69
+ background:#2b2b2b;
70
+ color:#ffffff;
71
+ border-radius:36px;
72
+ text-decoration:none;
73
+ font-weight:600;
74
+ font-size:16px;">
75
+ 💻 Code
76
+ </a>
77
+ </div>
78
+
79
+ <div style="max-width:900px;margin:0 auto;">
80
+
81
+ # Introduction
82
+ <div style="
83
+ max-width: 880px; /* 可按需调节整体宽度 */
84
+ margin: 0 auto; /* 居中容器 */
85
+ text-align: justify; /* 关键:两端对齐 */
86
+ text-justify: inter-word; /* 优化英文对齐效果 */
87
+ line-height: 1.6;">
88
+
89
+ OpenCUA models (OpenCUA-7B and OpenCUA-32B) are end-to-end computer-use foundation models than can produce executable actions in the computer environments. They are based on the weights of Qwen2.5-VL-7B-Instruction and Qwen2.5-VL-32B-Instruction.
90
+ They demonstrate superior performance across CUA benchmarks. In particular, <b>OpenCUA-32B</b> achieves an average success rate of **34.8%** on [OSWorld-Verified](https://os-world.github.io/),
91
+ establishing a new state-of-the-art (SOTA) among open-source models and surpassing OpenAI CUA (GPT-4o). Both models also have strong grounding performance, OpenCUA-32B achieves 59.6% on [OSWorld-G](https://osworld-grounding.github.io/) and 55.3% on [Screenspot-Pro](https://arxiv.org/abs/2504.07981).
92
+ </div>
93
+
94
+ ### Key Features
95
+
96
+ - **Superior Computer-Use Capablity**: Able to execute multi-step computer-use actions with effective planning and reasoning
97
+ - **Multi-OS Support**: Trained on demonstrations across Ubuntu, Windows, and macOS
98
+ - **Visual Grounding**: Strong GUI element recognition and spatial reasoning capabilities
99
+ - **Multi-Image Context**: Processes up to 3 screenshot history for better context understanding
100
+ - **Reflective Reasoning**: Enhanced with reflective long Chain-of-Thought that identifies errors and provides corrective reasoning
101
+
102
+
103
+ # Performance
104
+
105
+ ### Online Agent Evaluation
106
+ OpenCUA models achieves strong performance on **[OSWorld-Verified](https://os-world.github.io/)**.
107
+ OpenCUA-32B achieves the best performance among all open-source models with an average success rate of 34.8%, outperforming prior baselines by large margins.
108
+ It also closes the gap to proprietary Claude models.
109
+ <div align="center">
110
+
111
+ | **Model** | **15 Steps** | **50 Steps** | **100 Steps** |
112
+ |-------------------------------|:--------:|:--------:|:---------:|
113
+ | **Proprietary** | | | |
114
+ | OpenAI CUA | 26.0 | 31.3 | 31.4 |
115
+ | Seed 1.5-VL | 27.9 | — | 34.1 |
116
+ | Claude 3.7 Sonnet | 27.1 | 35.8 | 35.9 |
117
+ | Claude 4 Sonnet | 31.2 | 43.9 | 41.5 |
118
+ | **Open-Source** | | | |
119
+ | Qwen 2.5-VL-32B-Instruct | 3.0 | — | 3.9 |
120
+ | Qwen 2.5-VL-72B-Instruct | 4.4 | — | 5.0 |
121
+ | Kimi-VL-A3B | 9.7 | — | 10.3 |
122
+ | UI-TARS-72B-DPO | 24.0 | 25.8 | 27.1 |
123
+ | UI-TARS-1.5-7B | 24.5 | 27.3 | 27.4 |
124
+ | OpenCUA-7B *(Ours)* | 24.3 | 27.9 | 26.6 |
125
+ | **OpenCUA-32B *(Ours)*** | **29.7** | **34.1** | **34.8** |
126
+ </div>
127
+
128
+ *OpenCUA scores are the mean of 3 independent runs.*
129
+
130
+ ### GUI Grounding Performance
131
+ <div align="center">
132
+
133
+ | **Model** | **OSWorld-G** | **ScreenSpot-V2** | **ScreenSpot-Pro** |
134
+ |-------|-----------|---------------|----------------|
135
+ | Qwen2.5-VL-7B | 31.4 | 88.8 | 27.6 |
136
+ | Qwen2.5-VL-32B | 46.5 | 87.0 | 39.4 |
137
+ | UI-TARS-72B | 57.1 | 90.3 | 38.1 |
138
+ | **OpenCUA-A3B** | 48.6 | 91.4 | 28.5 |
139
+ | **OpenCUA-Qwen2-7B** | 45.7 | 88.5 | 23.7 |
140
+ | **OpenCUA-7B** | 55.3 | 92.3 | 50.0 |
141
+ | **OpenCUA-32B** | **59.6** | **93.4** | **55.3** |
142
+ </div>
143
+
144
+
145
+ ### AgentNetBench (Offline Evaluation)
146
+ <div align="center">
147
+
148
+ | **Model** | **Coordinate Actions** | **Content Actions** | **Function Actions** | **Average** |
149
+ |-------|-------------------|-----------------|------------------|---------|
150
+ | Qwen2.5-VL-7B | 50.7 | 40.8 | 3.1 | 48.0 |
151
+ | Qwen2.5-VL-32B | 66.6 | 47.2 | 41.5 | 64.8 |
152
+ | Qwen2.5-VL-72B | 67.2 | 52.6 | 50.5 | 67.0 |
153
+ | OpenAI CUA | 71.7 | 57.3 | **80.0** | 73.1 |
154
+ | **OpenCUA-7B** | 79.0 | 62.0 | 44.3 | 75.2 |
155
+ | **OpenCUA-32B** | **81.9** | 66.1 | 55.7 | **79.1** |
156
+ </div>
157
+
158
+ # 🚀 Quick Start
159
+ <div style="border-left: 6px solid #f28c28; background: #fff8e6; padding: 12px 16px; margin: 16px 0;">
160
+ <strong>⚠️ Important for Qwen-based Models (OpenCUA-7B, OpenCUA-32B):</strong>
161
+
162
+ To align with our training infrastructure, we have modified the model in two places:
163
+ <ul style="margin-top: 8px;">
164
+ <li>1. Multimodal Rotary Position Embedding (M-RoPE) has been replaced with 1D RoPE.</li>
165
+ <li>2. Using the same Tokenizer and ChatTemplate as Kimi-VL.</li>
166
+ <li>Do not use the default transformers and vllm classes to load the model. Tokenizer and Chat Template should be aligned if training the models.</li>
167
+ </ul>
168
+ </div>
169
+
170
+
171
+ ## Installation & Download
172
+
173
+ First, install the required transformers dependencies:
174
+
175
+ ```bash
176
+ conda create -n opencua python=3.10
177
+ conda activate opencua
178
+ pip install -r requirement.txt
179
+ ```
180
+
181
+ Download the model weight from huggingface:
182
+ ```bash
183
+ from huggingface_hub import snapshot_download
184
+ snapshot_download(
185
+ repo_id="xlangai/OpenCUA-32B",
186
+ local_dir="OpenCUA-32B",
187
+ local_dir_use_symlinks=False
188
+ )
189
+ ```
190
+
191
+ ## 🎯 GUI Grounding
192
+
193
+ The following code demonstrates how to use OpenCUA models for GUI grounding tasks:
194
+
195
+ ```python
196
+ import base64
197
+ import torch
198
+ from transformers import AutoTokenizer, AutoModel, AutoImageProcessor
199
+ from PIL import Image
200
+ import json
201
+
202
+ def encode_image(image_path: str) -> str:
203
+ """Encode image to base64 string for model input."""
204
+ with open(image_path, "rb") as f:
205
+ return base64.b64encode(f.read()).decode()
206
+
207
+ def load_opencua_model(model_path: str):
208
+ """Load OpenCUA model, tokenizer, and image processor."""
209
+ tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
210
+ model = AutoModel.from_pretrained(
211
+ model_path,
212
+ torch_dtype="auto",
213
+ device_map="auto",
214
+ trust_remote_code=True
215
+ )
216
+ image_processor = AutoImageProcessor.from_pretrained(model_path, trust_remote_code=True)
217
+
218
+ return model, tokenizer, image_processor
219
+
220
+ def create_grounding_messages(image_path: str, instruction: str):
221
+ """Create chat messages for GUI grounding task."""
222
+ system_prompt = (
223
+ "You are a GUI agent. You are given a task and a screenshot of the screen. "
224
+ "You need to perform a series of pyautogui actions to complete the task."
225
+ )
226
+
227
+ messages = [
228
+ {"role": "system", "content": system_prompt},
229
+ {
230
+ "role": "user",
231
+ "content": [
232
+ {"type": "image", "image": f"data:image/png;base64,{encode_image(image_path)}"},
233
+ {"type": "text", "text": instruction},
234
+ ],
235
+ },
236
+ ]
237
+ return messages
238
+
239
+ def run_inference(model, tokenizer, image_processor, messages, image_path):
240
+ """Run inference on the model."""
241
+ # Prepare text input
242
+ input_ids = tokenizer.apply_chat_template(
243
+ messages, tokenize=True, add_generation_prompt=True
244
+ )
245
+ input_ids = torch.tensor([input_ids]).to(model.device)
246
+
247
+ # Prepare image input
248
+ image = Image.open(image_path).convert('RGB')
249
+ image_info = image_processor.preprocess(images=[image])
250
+ pixel_values = torch.tensor(image_info['pixel_values']).to(
251
+ dtype=torch.bfloat16, device=model.device
252
+ )
253
+ grid_thws = torch.tensor(image_info['image_grid_thw'])
254
+
255
+ # Generate response
256
+ with torch.no_grad():
257
+ generated_ids = model.generate(
258
+ input_ids,
259
+ pixel_values=pixel_values,
260
+ grid_thws=grid_thws,
261
+ max_new_tokens=512,
262
+ temperature=0
263
+ )
264
+
265
+ # Decode output
266
+ prompt_len = input_ids.shape[1]
267
+ generated_ids = generated_ids[:, prompt_len:]
268
+ output_text = tokenizer.batch_decode(
269
+ generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
270
+ )[0]
271
+
272
+ return output_text
273
+
274
+ # Example usage
275
+ model_path = "xlangai/OpenCUA-32B" # or other model variants
276
+ image_path = "screenshot.png"
277
+ instruction = "Click on the submit button"
278
+
279
+ # Load model
280
+ model, tokenizer, image_processor = load_opencua_model(model_path)
281
+
282
+ # Create messages and run inference
283
+ messages = create_grounding_messages(image_path, instruction)
284
+ result = run_inference(model, tokenizer, image_processor, messages, image_path)
285
+
286
+ print("Model output:", result)
287
+ ```
288
+
289
+ <div style="border-left: 6px solid #9ca3af; background: #f5f5f5; padding: 12px 16px; margin: 16px 0;">
290
+ <em>Expected result: ```python
291
+ pyautogui.click(x=1432, y=344)
292
+ ```</em>
293
+ </div>
294
+
295
+ ## 🖥️ Computer Use Agent
296
+ **[OpenCUAAgent](https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/opencua_agent.py)** is developed in the [OSWorld](https://github.com/xlang-ai/OSWorld) environment based on OpenCUA models. It iteratively perceives the environment via screenshots, produces reflective long CoT as inner monologue, and predicts the next action to be executed. OpenCUAAgent uses 3 images in total and the L2 CoT format by default.
297
+
298
+ Command for running OpenCUA-7B and OpenCUA-32B in OSWorld:
299
+ ```
300
+ python run_multienv_opencua.py \
301
+ --headless \
302
+ --observation_type screenshot \
303
+ --model OpenCUA-32B \
304
+ --result_dir ./results --test_all_meta_path evaluation_examples/test_all_no_gdrive.json \
305
+ --max_steps 100 \
306
+ --num_envs 30 \
307
+ --coordinate_type qwen25
308
+ ```
309
+ <div style="border-left: 6px solid #9ca3af; background: #f5f5f5; padding: 12px 16px; margin: 16px 0;">
310
+ <em>Currently we only support huggingface inference. We are implementing vLLM support for OpenCUA models. Please stay tuned.</em>
311
+ </div>
312
+
313
+ ## Important Notes on Coordinate Systems
314
+ <div style="border-left: 6px solid #9ca3af; background: #f5f5f5; padding: 12px 16px; margin: 16px 0;">
315
+ <ul style="margin: 0;">
316
+ <li><strong><code>xlangai/OpenCUA-A3B</code></strong> – Relative coordinates <em>(not supported in this code)</em></li>
317
+ <li><strong><code>xlangai/OpenCUA-Qwen2-7B</code></strong> – Relative coordinates</li>
318
+ <li><strong><code>xlangai/OpenCUA-7B</code></strong> – Absolute coordinates</li>
319
+ <li><strong><code>xlangai/OpenCUA-32B</code></strong> – Absolute coordinates</li>
320
+ </ul>
321
+ </div>
322
+
323
+ **OpenCUA models use different coordinate systems depending on the base model:**
324
+
325
+ - **OpenCUA-Qwen2-7B**: Outputs **relative coordinates** (0.0 to 1.0 range)
326
+ ```python
327
+ # Example output: pyautogui.click(x=0.5, y=0.3)
328
+ # x=0.5 means 50% from left edge, y=0.3 means 30% from top edge
329
+
330
+ # Convert to absolute coordinates:
331
+ def qwen2_relative_to_absolute(rel_x, rel_y, original_width, original_height):
332
+ abs_x = int(rel_x * original_width)
333
+ abs_y = int(rel_y * original_height)
334
+ return abs_x, abs_y
335
+ ```
336
+
337
+ - **OpenCUA-7B and OpenCUA-32B** (Qwen2.5-based): Output **absolute coordinates** after smart resize
338
+ ```python
339
+ # Example output: pyautogui.click(x=960, y=324)
340
+ # These are coordinates on the smart-resized image, not the original image
341
+
342
+ # Convert to original image coordinates:
343
+ # Please refer to the smart_resize function in: https://github.com/huggingface/transformers/blob/67ddc82fbc7e52c6f42a395b4a6d278c55b77a39/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py#L55
344
+ def qwen25_smart_resize_to_absolute(model_x, model_y, original_width, original_height):
345
+ # First, calculate the smart-resized dimensions
346
+ resized_height, resized_width = smart_resize(original_height, original_width, factor = 28, min_pixels = 3136, max_pixels = 12845056)
347
+
348
+ # Convert model output to relative coordinates on original image
349
+ rel_x = model_x / resized_width
350
+ rel_y = model_y / resized_height
351
+
352
+ # Then convert to absolute coordinates on original image
353
+ abs_x = int(rel_x * original_width)
354
+ abs_y = int(rel_y * original_height)
355
+ return abs_x, abs_y
356
+ ```
357
+
358
+ <div style="border-left: 6px solid #9ca3af; background: #f5f5f5; padding: 12px 16px; margin: 16px 0;">
359
+ <strong>Understanding Smart Resize for Qwen2.5-based Models:</strong>
360
+ <p style="margin: 8px 0 0;">
361
+ The Qwen2.5-VL models use a “smart resize” preprocessing that maintains aspect ratio while fitting within pixel constraints.
362
+ For coordinate conversion, you need the smart resize function from the
363
+ <a href="https://github.com/QwenLM/Qwen2.5-VL/blob/d2240f11656bfe404b9ba56db4e51cd09f522ff1/qwen-vl-utils/src/qwen_vl_utils/vision_process.py#L60">
364
+ official Qwen2.5-VL implementation</a>.
365
+ </p>
366
+ </div>
367
+
368
+
369
+ # TODO
370
+ ## vLLM Support
371
+ We are actively working with the vLLM team to add support for OpenCUA models.
372
+
373
+ **Workaround:** For now, please use the standard transformers library as shown in the examples above. We will update this section once vLLM support becomes available.
374
+
375
+ ## Training Code
376
+ OpenCUA models are developed based on the training infrastructure of Kimi Team. We are developing the training pipeline based on the open-source infrastructure as well.
377
+
378
+ ## License
379
+
380
+ This project is licensed under the MIT License - see the LICENSE file in the root folder for details.
381
+
382
+ ## Research Use and Disclaimer
383
+
384
+ OpenCUA models are intended for **research and educational purposes only**.
385
+
386
+ ### Prohibited Uses
387
+ - The model may **not** be used for any purpose or activity that violates applicable laws or regulations in any jurisdiction
388
+ - Use for illegal, unethical, or harmful activities is strictly prohibited
389
+
390
+ ### Disclaimer
391
+ - The authors, contributors, and copyright holders are **not responsible** for any illegal, unethical, or harmful use of the Software, nor for any direct or indirect damages resulting from such use
392
+ - Use of the "OpenCUA" name, logo, or trademarks does **not** imply any endorsement or affiliation unless separate written permission is obtained
393
+ - Users are solely responsible for ensuring their use complies with applicable laws and regulations
394
+
395
+ ## Citation
396
+
397
+ If you use OpenCUA models in your research, please cite our work:
398
+
399
+ ```bibtex
400
+ @misc{wang2025opencuaopenfoundationscomputeruse,
401
+ title={OpenCUA: Open Foundations for Computer-Use Agents},
402
+ author={Xinyuan Wang and Bowen Wang and Dunjie Lu and Junlin Yang and Tianbao Xie and Junli Wang and Jiaqi Deng and Xiaole Guo and Yiheng Xu and Chen Henry Wu and Zhennan Shen and Zhuokai Li and Ryan Li and Xiaochuan Li and Junda Chen and Boyuan Zheng and Peihang Li and Fangyu Lei and Ruisheng Cao and Yeqiao Fu and Dongchan Shin and Martin Shin and Jiarui Hu and Yuyan Wang and Jixuan Chen and Yuxiao Ye and Danyang Zhang and Dikang Du and Hao Hu and Huarong Chen and Zaida Zhou and Haotian Yao and Ziwei Chen and Qizheng Gu and Yipu Wang and Heng Wang and Diyi Yang and Victor Zhong and Flood Sung and Y. Charles and Zhilin Yang and Tao Yu},
403
+ year={2025},
404
+ eprint={2508.09123},
405
+ archivePrefix={arXiv},
406
+ primaryClass={cs.AI},
407
+ url={https://arxiv.org/abs/2508.09123},
408
+ }
409
+ ```
410
+
411
+ </div>
config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "OpenCUAForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_opencua.OpenCUAConfig",
7
+ "AutoModel": "modeling_opencua.OpenCUAForConditionalGeneration",
8
+ "AutoModelForCausalLM": "modeling_opencua.OpenCUAForConditionalGeneration"
9
+ },
10
+ "ignore_index": -100,
11
+ "media_placeholder_token_id": 151664,
12
+ "model_type": "opencua",
13
+ "pad_token_id": 0,
14
+ "text_config": {
15
+ "bos_token_id": 151643,
16
+ "eos_token_id": 151644,
17
+ "head_dim": 128,
18
+ "hidden_act": "silu",
19
+ "hidden_size": 5120,
20
+ "initializer_range": 0.02,
21
+ "intermediate_size": 27648,
22
+ "k_proj_bias": true,
23
+ "max_length": 20,
24
+ "min_length": 0,
25
+ "model_type": "qwen2",
26
+ "num_attention_heads": 40,
27
+ "num_beam_groups": 1,
28
+ "num_beams": 1,
29
+ "num_hidden_layers": 64,
30
+ "num_key_value_heads": 8,
31
+ "pad_token_id": 152063,
32
+ "pretraining_sequence_length": 131072,
33
+ "q_proj_bias": true,
34
+ "rms_norm_eps": 1e-05,
35
+ "rope_theta": 1000000.0,
36
+ "tie_word_embeddings": false,
37
+ "torch_dtype": "bfloat16",
38
+ "use_bfloat16": false,
39
+ "use_cache": true,
40
+ "v_proj_bias": true,
41
+ "vocab_size": 152064
42
+ },
43
+ "tie_word_embeddings": false,
44
+ "torch_dtype": "bfloat16",
45
+ "transformers_version": "4.48.3",
46
+ "vision_config": {
47
+ "depth": 32,
48
+ "fullatt_block_indexes": [
49
+ 7,
50
+ 15,
51
+ 23,
52
+ 31
53
+ ],
54
+ "hidden_act": "silu",
55
+ "hidden_size": 1280,
56
+ "num_heads": 16,
57
+ "in_chans": 3,
58
+ "intermediate_size": 3456,
59
+
60
+ "patch_size": 14,
61
+ "spatial_merge_size": 2,
62
+ "spatial_patch_size": 14,
63
+ "temporal_patch_size": 2,
64
+ "out_hidden_size": 5120,
65
+ "tokens_per_second": 2,
66
+ "window_size": 112
67
+ },
68
+ "vocab_size": 152064
69
+ }
configuration_opencua.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
3
+ from transformers.models.qwen2.configuration_qwen2 import Qwen2Config
4
+
5
+
6
class OpenCUAConfig(PretrainedConfig):
    """Configuration for the OpenCUA multimodal model.

    Pairs a Qwen2.5-VL vision-tower configuration with a Qwen2 language-model
    configuration under a single top-level config object.

    Args:
        vision_config: Vision-model settings, as a dict or a
            ``Qwen2_5_VLVisionConfig`` instance.
        text_config: Text-model settings, as a dict or a ``Qwen2Config``
            instance.
        ignore_index: Label value ignored when computing the loss.
        media_placeholder_token_id: Token id that marks image placeholders
            in the token stream.
        pad_token_id: Token id used for padding.
    """

    model_type = "opencua"

    def __init__(
        self,
        vision_config: dict | Qwen2_5_VLVisionConfig | None = None,
        text_config: dict | Qwen2Config | None = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 151664,
        pad_token_id: int = 0,
        **kwargs
    ):
        # Plain dicts (e.g. loaded from config.json) are promoted to their
        # dedicated config classes; config instances and None pass through.
        self.vision_config = (
            Qwen2_5_VLVisionConfig(**vision_config)
            if isinstance(vision_config, dict)
            else vision_config
        )
        self.text_config = (
            Qwen2Config(**text_config)
            if isinstance(text_config, dict)
            else text_config
        )

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        super().__init__(pad_token_id=pad_token_id, **kwargs)
generation_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_length": 32768,
3
+ "eos_token_id": 151644
4
+ }
handler.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ from transformers import (
3
+ AutoTokenizer,
4
+ AutoModel,
5
+ AutoImageProcessor,
6
+ )
7
+ import torch
8
+ from PIL import Image
9
+ import base64
10
+ import io
11
+
12
+ # get dtype and device
13
+ dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
14
+ device = "cuda" if torch.cuda.is_available() else "cpu"
15
+
16
class EndpointHandler():
    """Hugging Face Inference Endpoint handler for OpenCUA models.

    Loads the tokenizer, image processor and model once at startup and serves
    either plain-text prompts or chat-format requests (optionally carrying
    base64 data-URL images) through ``__call__``.
    """

    def __init__(self, path=""):
        """Load tokenizer, image processor and model weights from *path*.

        The repository ships custom modeling code, so ``trust_remote_code``
        is required and ``AutoModel`` (not the default causal-LM class) must
        be used, matching the repo's ``auto_map``.
        """
        print(f"Initializing model on device: {device}")
        print(f"Using dtype: {dtype}")

        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
        self.image_processor = AutoImageProcessor.from_pretrained(path, trust_remote_code=True)

        if device == "cuda":
            # Shard automatically across all visible GPUs.
            self.model = AutoModel.from_pretrained(
                path,
                torch_dtype=dtype,
                trust_remote_code=True,
                device_map="auto"
            )
        else:
            self.model = AutoModel.from_pretrained(
                path,
                torch_dtype=dtype,
                trust_remote_code=True
            )
            self.model = self.model.to(device)

        print(f"Model loaded successfully on device: {self.model.device}")
        print(f"Model dtype: {next(self.model.parameters()).dtype}")

    def _collect_image_tensors(self, messages):
        """Decode every base64 data-URL image in *messages* into model tensors.

        Scans chat messages for ``{"type": "image_url"}`` content items whose
        URL is an inline ``data:image/...`` payload. Remote (http) URLs are
        ignored — this handler performs no network fetches.

        Returns:
            Tuple ``(pixel_values_list, grid_thws_list)``; both lists are
            empty when the request contains no inline images.
        """
        pixel_values_list = []
        grid_thws_list = []
        for message in messages:
            if isinstance(message.get("content"), list):
                for content_item in message["content"]:
                    if content_item.get("type") == "image_url":
                        image_data = content_item.get("image_url").get("url", "")
                        if image_data.startswith("data:image"):
                            # Strip the "data:image/...;base64," header.
                            image_data = image_data.split(",")[1]
                            image_bytes = base64.b64decode(image_data)
                            image = Image.open(io.BytesIO(image_bytes)).convert('RGB')

                            # Each image is preprocessed individually; the
                            # processor also returns its grid_thw geometry.
                            info = self.image_processor.preprocess(images=[image])
                            pixel_values = torch.tensor(info['pixel_values']).to(dtype=dtype, device=self.model.device)
                            grid_thws = torch.tensor(info['image_grid_thw']).to(self.model.device)

                            pixel_values_list.append(pixel_values)
                            grid_thws_list.append(grid_thws)
        return pixel_values_list, grid_thws_list

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        data args:
            inputs (:obj:`str` or `list`): messages in chat format or text input
            parameters (:obj:`dict`): generation parameters
        Return:
            A :obj:`list` | `dict`: will be serialized and returned
        """
        print("Call inside handler")
        # Pop "parameters" BEFORE resolving "inputs": when the request has no
        # "inputs" key, `inputs` aliases `data` itself, and popping from
        # `data` afterwards would silently mutate the inputs payload.
        parameters = data.pop("parameters", {})
        inputs = data.pop("inputs", data)
        print("parameters", parameters)

        # Drop TGI-style request fields that model.generate() does not accept.
        parameters.pop("details", None)
        parameters.pop("stop", None)
        parameters.pop("return_full_text", None)
        # NOTE: a caller-supplied "do_sample" flag is now passed through
        # unchanged; previously it was unconditionally forced to True, which
        # silently turned greedy (do_sample=False) requests into sampling ones.

        # Default generation parameters.
        max_new_tokens = parameters.pop("max_new_tokens", 512)
        temperature = parameters.pop("temperature", 0)

        try:
            if isinstance(inputs, str):
                # Plain string: simple text-only generation.
                input_ids = self.tokenizer.encode(inputs, return_tensors="pt").to(self.model.device)
                generated_ids = self.model.generate(
                    input_ids,
                    max_new_tokens=max_new_tokens,
                    temperature=temperature,
                    **parameters
                )
                # Return only the continuation, not the echoed prompt.
                prompt_len = input_ids.shape[1]
                generated_ids = generated_ids[:, prompt_len:]
                output_text = self.tokenizer.batch_decode(
                    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
                return [{"generated_text": output_text[0]}]

            elif isinstance(inputs, list):
                # Chat format, possibly multimodal.
                messages = inputs

                input_ids = self.tokenizer.apply_chat_template(
                    messages, tokenize=True, add_generation_prompt=True
                )
                input_text = self.tokenizer.decode(input_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
                print(input_text)

                input_ids = torch.tensor([input_ids]).to(self.model.device)

                # Decode ALL inline images carried by the messages.
                pixel_values_list, grid_thws_list = self._collect_image_tensors(messages)

                if pixel_values_list and grid_thws_list:
                    # Multimodal generation: concatenate per-image tensors
                    # along the patch dimension for a single forward pass.
                    all_pixel_values = torch.cat(pixel_values_list, dim=0)
                    all_grid_thws = torch.cat(grid_thws_list, dim=0)

                    print(f"Processing {len(pixel_values_list)} images")
                    print(f"pixel_values shape: {all_pixel_values.shape}")
                    print(f"grid_thws shape: {all_grid_thws.shape}")
                    print("grid_thws", all_grid_thws)

                    # Ensure all tensors live on the same device as the model.
                    all_pixel_values = all_pixel_values.to(self.model.device)
                    all_grid_thws = all_grid_thws.to(self.model.device)

                    with torch.no_grad():
                        generated_ids = self.model.generate(
                            input_ids,
                            pixel_values=all_pixel_values,
                            grid_thws=all_grid_thws,
                            max_new_tokens=max_new_tokens,
                            temperature=temperature,
                            **parameters
                        )
                else:
                    # Text-only chat generation.
                    generated_ids = self.model.generate(
                        input_ids,
                        max_new_tokens=max_new_tokens,
                        temperature=temperature,
                        **parameters
                    )

                prompt_len = input_ids.shape[1]
                generated_ids = generated_ids[:, prompt_len:]
                output_text = self.tokenizer.batch_decode(
                    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
                )
                print("##Model Response##", output_text)
                return [{"generated_text": output_text[0]}]

            else:
                raise ValueError(f"Unsupported input type: {type(inputs)}")

        except Exception as e:
            # Endpoint contract: report errors as payload rather than crash.
            print(f"Error during inference: {str(e)}")
            return [{"error": str(e)}]
model-1-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aca1939cb672391efc7d1776450497a468c7cb81c4b1e3ed1eea53af67b616b
3
+ size 3910076520
model-10-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:314e7614356d3c95e09e544e9fd47562d85951ab89dbee7ce6a287e063ecf293
3
+ size 975212312
model-11-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad1cf3ef5479dd3b80dd9481ddf61d286c0aff0df324e00eb780d0441c25ca6e
3
+ size 975212320
model-12-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:101a00c1f24a1384d2fc4392fc4e8a158f5af10178386bbd1d68f7b9c8f55f28
3
+ size 975212320
model-13-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d29d0f6d70549ea8e0fc5931ac5af1f8e3fed24341bfd0609968f0cd8af0883
3
+ size 975212320
model-14-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3829ef7b272767f8b7dbfaedfa6c646c970481d7adac8589bcc4eab83e205f1e
3
+ size 975212320
model-15-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54289f90134d13eb68235d633421b91f8523b7ab1d3bdcedb56abbc842e1abc4
3
+ size 975212320
model-16-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b58ab69762e4a80f82297f29e0181cd272da54c51824dfb0cf837bc84f66fbb
3
+ size 975212320
model-17-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc03b542261f4832f9cb76227d3a6ed57820b515b7621e5980e4781768bd6308
3
+ size 975212320
model-18-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7aa625aef03f270e160526c9d767f9223b49161715a60000970ba61da8002af6
3
+ size 975212320
model-19-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10f0e43ec131e47545673ccf8f82fb280b5cfe9eb8b176de09af3412b83ebd3c
3
+ size 975212320
model-2-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32e4ea7efa92c57f60465442d016cf63f3d91e21c667ce97180d7b461aeec2ee
3
+ size 975212312
model-20-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f55d2a8fb141e8f2680859419041dbf9ed9bf3476fe1b04ad63a1c80b515d2d1
3
+ size 975212320
model-21-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a44ba3081602764e1e60f2097fd3f643c6bedde6f61d34101e40c19ecdf9a845
3
+ size 975212320
model-22-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90147767925bbb504d648eac8677128aa77e201d8c005365f16e90042946d7fb
3
+ size 975212320
model-23-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e23cae09d39bc395bf12220c44ee49de74c2e2adb4bb74c42a31300404f6e97b
3
+ size 975212320
model-24-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db13aedf91d13cb8817816a746fa64e34d171e90ad8e1e2dda6c2cc311d3cb5d
3
+ size 975212320
model-25-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:238eb5f6b8452295aef90060bb2e6f72edebfd316f63eb0b4ded48aab7285c87
3
+ size 975212320
model-26-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:232379d9dafa70056ab2a6031226ddce4d0d2b5e8a49b6e60ef7e9722e84b087
3
+ size 975212320
model-27-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cf53d1a0e56fb2d110c32cbffa1e29bfa2ebab91ef5e338c236e8a1b4197725
3
+ size 975212320
model-28-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77ad190724c9426fdaa7e4394f69cde049e4890425543d1e4fd8ff2ca949bb0
3
+ size 975212320
model-29-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0754b041c3d5cfc3dea0161698e788139329f71866e0de83d2836fe9ac67c671
3
+ size 975212320
model-3-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a90489efdefc82a92eab3393dda904cf81ed0c2f72e76af3f2bf26c7f53da370
3
+ size 975212312
model-30-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fdd7bbde153529db81a39a92a6702ce5bee72aedf4c90e1db484d66966f375bb
3
+ size 975212320
model-31-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4e51a5b5edfd8ccb34a2ddde73bf787991b7ca7fb17bf71aa9eaa2059a320552
3
+ size 975212320
model-32-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96e43de41ad3132d829f9639840f0da60ac3388e891379c57358e07718230c8d
3
+ size 975212320
model-33-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13ba5a591d8e06a48b1e808ffdb5a42df90139b15e8df42233614c92ccc3f6b5
3
+ size 975212320
model-34-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08a2fe4e193b08d56c60d4511e9647cca566d0042317959c816e0698a7e31724
3
+ size 975212320
model-35-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:504b5985bc286473c0b8fb7708873492f0849fc127bc568aad2cb6a722860de2
3
+ size 975212320
model-36-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8fa50a75d040f40481eedc2fc539ad1077f8233b0041be408ae3504b03b67fd
3
+ size 975212320
model-37-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12dc0628c16279a89233334470722c31f3932a83968563f4e8d42c68677ba5d4
3
+ size 975212320
model-38-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d942860b48b66ced151d242caaf4439e7cb62061ea84dd41395b43b08e5aa536
3
+ size 975212320
model-39-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8cf8c139d14df728793a85e6c4e947f13816e3da2f12df02382b8299580081bb
3
+ size 975212320
model-4-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9acdd3d8b5218fbf943de6cf948480aa19abe0776a46c044d644aab920ddfdac
3
+ size 975212312
model-40-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:510cd3e52d48e1175966d691ebc0739012b31fcf766a31d1d41c50ce011a7c60
3
+ size 975212320
model-41-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2467e759d9ca1d233b04f9b751f012732a4db2ba58c91dccf00df81b9fe0a63
3
+ size 975212320
model-42-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:767ee823997ecda01d8841c6f8277bbc33adfe07639c954b5b29c04f1a7d1942
3
+ size 975212320
model-43-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:497e9690ec2b244e6cd5d6ed8e905dbc25308b46fd750fec2da5bca4b2ae0d6f
3
+ size 975212320
model-44-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f5eb867d8ded2a16569f2d681a26f336acce5198bef1310c56a7a231104a978
3
+ size 975212320
model-45-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb08c3f5843186b2757693bcdeffdf6bf3517edf70f803c764f1b9ff12d367ce
3
+ size 975212320
model-46-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe59b664845ef521a7980cf77dd2e03457686a3900bfbddc16a0a4685e3abc06
3
+ size 975212320
model-47-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32d200705d0f038dcdb56a0420541546cfb379f75a25673ccd53c4fc86abb25f
3
+ size 975212320
model-48-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d028ef374ea67dbd9237ffa7fed95567e3152b22e887a237f7130612c5e03957
3
+ size 975212320
model-49-of-64.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:795aa39683c39d7f37046ab3630e863d29837f0007d6013086bdfd92d48a7240
3
+ size 975212320