Instructions to use stepfun-ai/Step-3.7-Flash with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use stepfun-ai/Step-3.7-Flash with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="stepfun-ai/Step-3.7-Flash", trust_remote_code=True)
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
pipe(text=messages)

# Load model directly
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("stepfun-ai/Step-3.7-Flash", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use stepfun-ai/Step-3.7-Flash with vLLM:

Install from pip and serve model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "stepfun-ai/Step-3.7-Flash"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "stepfun-ai/Step-3.7-Flash",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/stepfun-ai/Step-3.7-Flash

SGLang

How to use stepfun-ai/Step-3.7-Flash with SGLang:

Install from pip and serve model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "stepfun-ai/Step-3.7-Flash" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "stepfun-ai/Step-3.7-Flash",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "stepfun-ai/Step-3.7-Flash" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "stepfun-ai/Step-3.7-Flash",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use stepfun-ai/Step-3.7-Flash with Docker Model Runner:
```
docker model run hf.co/stepfun-ai/Step-3.7-Flash
```

luotingdan committed 3 days ago

Commit

a9c0171

1 Parent(s): 7805a18

update processor config

Browse files

Files changed (4) hide show

config.json +338 -338
configuration_step3p7.py +3 -15
modeling_step3p7.py +37 -27
processing_step3.py +11 -0

config.json CHANGED Viewed

@@ -1,345 +1,345 @@
 {
     "architectures": [
-      "Step3p7ForConditionalGeneration"
     ],
-    "auto_map": {
-      "AutoConfig": "configuration_step3p7.Step3p7Config",
-      "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
     },
-    "model_type": "step3p7",
-    "im_end_token": "<im_end>",
-    "im_patch_token": "<im_patch>",
-    "im_start_token": "<im_start>",
-    "image_token_len": 169,
-    "patch_token_len": 81,
-    "image_token_id": 128001,
-    "understand_projector_stride": 2,
-    "use_im_start_end": "true",
-    "vision_select_layer": -1,
-    "projector_bias": false,
-    "vision_config": {
-      "model_type": "perception_encoder",
-      "image_size": 728,
-      "patch_size": 14,
-      "width": 1536,
-      "layers": 47,
-      "heads": 16,
-      "pool_type": "none",
-      "output_dim": null,
-      "use_cls_token": false,
-      "ls_init_value": 0.1,
-      "use_ln_post": false,
-      "hidden_act": "quick_gelu"
-    },
-    "text_config": {
-      "architectures": [
-        "Step3p5ForCausalLM"
-      ],
-      "rope_scaling": {
-        "rope_type": "llama3",
-        "factor": 2.0,
-        "original_max_position_embeddings": 131072,
-        "low_freq_factor": 1.0,
-        "high_freq_factor": 32.0
-      },
-      "yarn_only_types": [
-        "full_attention"
-      ],
-      "model_type": "step3p5",
-      "hidden_size": 4096,
-      "intermediate_size": 11264,
-      "num_hidden_layers": 45,
-      "max_seq_len": 262144,
-      "max_position_embeddings": 262144,
-      "vocab_size": 128896,
-      "torch_dtype": "bfloat16",
-      "use_qk_norm": false,
-      "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
-      "use_mfa": false,
-      "num_attention_heads": 64,
       "num_attention_groups": 8,
       "head_dim": 128,
-      "use_moe": true,
-      "moe_num_experts": 288,
-      "moe_top_k": 8,
-      "moe_intermediate_size": 1280,
-      "share_expert_dim": 1280,
-      "moe_layer_offset": 0,
-      "moe_every_n_layer": 1,
-      "norm_expert_weight": true,
-      "moe_router_activation": "sigmoid",
-      "moe_router_scaling_factor": 3.0,
-      "att_impl_type": "GQA",
-      "num_nextn_predict_layers": 3,
-      "rope_theta": [
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0,
-        5000000.0,
-        10000.0,
-        10000.0,
-        10000.0
-      ],
-      "use_head_wise_attn_gate": true,
-      "sliding_window": 512,
-      "use_moe_router_bias": true,
-      "need_fp32_gate": true,
-      "sink": false,
-      "layer_types": [
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "full_attention",
-        "sliding_attention",
-        "sliding_attention",
-        "sliding_attention"
-      ],
-      "use_rope_layers": [],
-      "partial_rotary_factors": [
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0,
-        0.5,
-        1.0,
-        1.0,
-        1.0
-      ],
-      "eos_token_id": [
-        1,
-        2,
-        128007
-      ],
-      "bos_token_id": 0,
-      "attention_other_setting": {
-        "attention_type": "sliding_attention",
-        "num_attention_heads": 96,
-        "num_attention_groups": 8,
-        "head_dim": 128,
-        "true_head_dim": 128
-      },
-      "swiglu_limits": [
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        7,
-        7,
-        0.0,
-        0.0,
-        0.0
-      ],
-      "swiglu_limits_shared": [
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        0.0,
-        16,
-        16,
-        0.0,
-        0.0,
-        0.0
-      ]
-    }
   }

 {
+  "architectures": [
+    "Step3p7ForConditionalGeneration"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_step3p7.Step3p7Config",
+    "AutoProcessor": "processing_step3.Step3VLProcessor",
+    "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
+  },
+  "model_type": "step3p7",
+  "im_end_token": "<im_end>",
+  "im_patch_token": "<im_patch>",
+  "im_start_token": "<im_start>",
+  "image_token_len": 169,
+  "patch_token_len": 81,
+  "image_token_id": 128001,
+  "understand_projector_stride": 2,
+  "use_im_start_end": "true",
+  "vision_select_layer": -1,
+  "projector_bias": false,
+  "vision_config": {
+    "model_type": "perception_encoder",
+    "image_size": 728,
+    "patch_size": 14,
+    "width": 1536,
+    "layers": 47,
+    "heads": 16,
+    "pool_type": "none",
+    "output_dim": null,
+    "use_cls_token": false,
+    "ls_init_value": 0.1,
+    "use_ln_post": false,
+    "hidden_act": "quick_gelu"
+  },
+  "text_config": {
     "architectures": [
+      "Step3p5ForCausalLM"
     ],
+    "rope_scaling": {
+      "rope_type": "llama3",
+      "factor": 2.0,
+      "original_max_position_embeddings": 131072,
+      "low_freq_factor": 1.0,
+      "high_freq_factor": 32.0
     },
+    "yarn_only_types": [
+      "full_attention"
+    ],
+    "model_type": "step3p5",
+    "hidden_size": 4096,
+    "intermediate_size": 11264,
+    "num_hidden_layers": 45,
+    "max_seq_len": 262144,
+    "max_position_embeddings": 262144,
+    "vocab_size": 128896,
+    "torch_dtype": "bfloat16",
+    "use_qk_norm": false,
+    "moe_layers_enum": "3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44",
+    "use_mfa": false,
+    "num_attention_heads": 64,
+    "num_attention_groups": 8,
+    "head_dim": 128,
+    "use_moe": true,
+    "moe_num_experts": 288,
+    "moe_top_k": 8,
+    "moe_intermediate_size": 1280,
+    "share_expert_dim": 1280,
+    "moe_layer_offset": 0,
+    "moe_every_n_layer": 1,
+    "norm_expert_weight": true,
+    "moe_router_activation": "sigmoid",
+    "moe_router_scaling_factor": 3.0,
+    "att_impl_type": "GQA",
+    "num_nextn_predict_layers": 3,
+    "rope_theta": [
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0,
+      5000000.0,
+      10000.0,
+      10000.0,
+      10000.0
+    ],
+    "use_head_wise_attn_gate": true,
+    "sliding_window": 512,
+    "use_moe_router_bias": true,
+    "need_fp32_gate": true,
+    "sink": false,
+    "layer_types": [
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention"
+    ],
+    "use_rope_layers": [],
+    "partial_rotary_factors": [
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0,
+      0.5,
+      1.0,
+      1.0,
+      1.0
+    ],
+    "eos_token_id": [
+      1,
+      2,
+      128007
+    ],
+    "bos_token_id": 0,
+    "attention_other_setting": {
+      "attention_type": "sliding_attention",
+      "num_attention_heads": 96,
       "num_attention_groups": 8,
       "head_dim": 128,
+      "true_head_dim": 128
+    },
+    "swiglu_limits": [
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      7,
+      7,
+      0.0,
+      0.0,
+      0.0
+    ],
+    "swiglu_limits_shared": [
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      0.0,
+      16,
+      16,
+      0.0,
+      0.0,
+      0.0
+    ]
   }
+}

configuration_step3p7.py CHANGED Viewed

@@ -91,23 +91,10 @@ class Step3p7TextConfig(PretrainedConfig):
         **kwargs,
     ) -> None:
         torch_dtype = kwargs.get("torch_dtype")
-        layer_types = _normalize_per_layer_values(layer_types,
                                                   num_hidden_layers)
-        swiglu_limits = _normalize_per_layer_values(swiglu_limits,
-                                                    num_hidden_layers)
-        swiglu_limits_shared = _normalize_per_layer_values(
-            swiglu_limits_shared, num_hidden_layers)
-        partial_rotary_factors = kwargs.get("partial_rotary_factors")
-        kwargs["partial_rotary_factors"] = _normalize_per_layer_values(
-            partial_rotary_factors, num_hidden_layers)
-        if isinstance(rope_theta, list):
-            rope_theta = _normalize_per_layer_values(rope_theta,
-                                                     num_hidden_layers)
         if isinstance(rope_scaling, dict):
             rope_scaling = dict(rope_scaling)
-        if use_rope_layers:
-            use_rope_layers = _normalize_per_layer_values(
-                use_rope_layers, num_hidden_layers)
         if share_expert_dim is None:
             share_expert_dim = share_expert_dims
         self.hidden_size = hidden_size
@@ -128,7 +115,7 @@ class Step3p7TextConfig(PretrainedConfig):
         self.head_dim = head_dim
         self.norm_expert_weight = norm_expert_weight
         self.moe_layers_enum = moe_layers_enum
-        self.layer_types = layer_types
         self.sliding_window = sliding_window
         self.pad_token_id = pad_token_id
         self.attention_dropout = attention_dropout
@@ -145,6 +132,7 @@ class Step3p7TextConfig(PretrainedConfig):
         super().__init__(**kwargs)
         if torch_dtype is not None:
             self.torch_dtype = torch_dtype
     def to_dict(self):
         output = super().to_dict()

         **kwargs,
     ) -> None:
         torch_dtype = kwargs.get("torch_dtype")
+        trim_layer_types = _normalize_per_layer_values(layer_types,
                                                   num_hidden_layers)
         if isinstance(rope_scaling, dict):
             rope_scaling = dict(rope_scaling)
         if share_expert_dim is None:
             share_expert_dim = share_expert_dims
         self.hidden_size = hidden_size
         self.head_dim = head_dim
         self.norm_expert_weight = norm_expert_weight
         self.moe_layers_enum = moe_layers_enum
+        self.layer_types = trim_layer_types
         self.sliding_window = sliding_window
         self.pad_token_id = pad_token_id
         self.attention_dropout = attention_dropout
         super().__init__(**kwargs)
         if torch_dtype is not None:
             self.torch_dtype = torch_dtype
+        self.layer_types = layer_types
     def to_dict(self):
         output = super().to_dict()

modeling_step3p7.py CHANGED Viewed

@@ -199,36 +199,40 @@ class Step3p7PreTrainedModel(PreTrainedModel):
 class Step3p7RotaryEmbedding(nn.Module):
     def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
         super().__init__()
-        # BC: "rope_type" was originally "type"
         self.layer_idx = layer_idx
-        self.original_rope_parameters = None
-        if config.rope_parameters is not None:
-            self.original_rope_parameters = config.rope_parameters
-            config.rope_parameters = dict(config.rope_parameters)
-            self.rope_type = config.rope_parameters.get(
-                "rope_type", config.rope_parameters.get("type")
-            )
-        else:
-            self.rope_type = "default"
         self.max_seq_len_cached = config.max_position_embeddings
         self.original_max_seq_len = config.max_position_embeddings
-        partial_rotary_factors = getattr(
-            config, "partial_rotary_factors", None
-        )
         if partial_rotary_factors is not None:
-            config.partial_rotary_factor = partial_rotary_factors[self.layer_idx]
-        else:
-            config.partial_rotary_factor = 1.0
-        self.rope_theta = config.rope_theta
-        if isinstance(config.rope_theta, list):
-            self.rope_theta = config.rope_theta.copy()
-            config.rope_theta = self.rope_theta[self.layer_idx]
         self.config = copy.copy(config)
         if config.rope_parameters is not None:
-            self.config.rope_parameters = dict(config.rope_parameters)
         self.rope_init_fn = self.compute_default_rope_parameters
         if self.rope_type != "default":
             self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
@@ -238,8 +242,6 @@ class Step3p7RotaryEmbedding(nn.Module):
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
-        config.rope_theta = self.rope_theta
-        config.rope_parameters = self.original_rope_parameters
     @torch.no_grad()
     @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -288,10 +290,14 @@ class Step3p7RotaryEmbedding(nn.Module):
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
         """
         base = config.rope_theta
-        dim = (
             getattr(config, "head_dim", None)
             or config.hidden_size // config.num_attention_heads
         )
         attention_factor = 1.0  # Unused in this type of RoPE
@@ -968,7 +974,6 @@ class Step3p7TextModel(Step3p7TextPreTrainedModel, GenerationMixin):
             mask_kwargs = {
                 "config": self.config,
                 "attention_mask": attention_mask,
-                "cache_position": cache_position,
                 "past_key_values": past_key_values,
                 "position_ids": position_ids,
             }
@@ -1381,7 +1386,12 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
             **kwargs,
         )
-        if cache_position[0] == 0:
             # During cached decoding, input ids no longer contain image tokens,
             # so pixel values should only be passed at the first step.
             model_inputs["pixel_values"] = pixel_values
@@ -1392,4 +1402,4 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
         if key.startswith("language_model."):
             return key[len("language_model.") :], True
-        return key, False

 class Step3p7RotaryEmbedding(nn.Module):
     def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
         super().__init__()
         self.layer_idx = layer_idx
         self.max_seq_len_cached = config.max_position_embeddings
         self.original_max_seq_len = config.max_position_embeddings
+        rope_theta = config.rope_theta
+        if isinstance(rope_theta, list):
+            rope_theta = rope_theta[0 if layer_idx is None else layer_idx]
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
+        partial_rotary_factors = getattr(config, "partial_rotary_factors", None)
         if partial_rotary_factors is not None:
+            partial_rotary_factor = partial_rotary_factors[
+                0 if layer_idx is None else layer_idx
+            ]
+        self.rope_theta = rope_theta
+        self.partial_rotary_factor = partial_rotary_factor
         self.config = copy.copy(config)
+        self.config.rope_theta = rope_theta
+        self.config.partial_rotary_factor = partial_rotary_factor
         if config.rope_parameters is not None:
+            self.config.rope_parameters = copy.deepcopy(config.rope_parameters)
+            self.config.rope_parameters["rope_theta"] = rope_theta
+            self.config.rope_parameters["partial_rotary_factor"] = (
+                partial_rotary_factor
+            )
+            self.rope_type = self.config.rope_parameters.get(
+                "rope_type", self.config.rope_parameters.get("type")
+            )
+        else:
+            self.rope_type = "default"
         self.rope_init_fn = self.compute_default_rope_parameters
         if self.rope_type != "default":
             self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
     @torch.no_grad()
     @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
         """
         base = config.rope_theta
+        partial_rotary_factor = getattr(
+            config, "partial_rotary_factor", 1.0
+        )
+        head_dim = (
             getattr(config, "head_dim", None)
             or config.hidden_size // config.num_attention_heads
         )
+        dim = int(head_dim * partial_rotary_factor)
         attention_factor = 1.0  # Unused in this type of RoPE
             mask_kwargs = {
                 "config": self.config,
                 "attention_mask": attention_mask,
                 "past_key_values": past_key_values,
                 "position_ids": position_ids,
             }
             **kwargs,
         )
+        generation_cache_position = model_inputs.get("cache_position", cache_position)
+        is_prefill = past_key_values is None
+        if generation_cache_position is not None and generation_cache_position.numel() > 0:
+            is_prefill = generation_cache_position[0].item() == 0
+        if is_prefill:
             # During cached decoding, input ids no longer contain image tokens,
             # so pixel values should only be passed at the first step.
             model_inputs["pixel_values"] = pixel_values
         if key.startswith("language_model."):
             return key[len("language_model.") :], True
+        return key, False

processing_step3.py CHANGED Viewed

@@ -16,6 +16,7 @@ from torchvision.transforms.functional import InterpolationMode
 from transformers.feature_extraction_utils import BatchFeature, TensorType
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from math import ceil
 from itertools import product
@@ -255,6 +256,16 @@ class Step3VLProcessor(ProcessorMixin):
     attributes = ["tokenizer"]
     tokenizer_class = "AutoTokenizer"
     def __init__(
         self,
         tokenizer=None,

 from transformers.feature_extraction_utils import BatchFeature, TensorType
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_tokenizers import TokenizersBackend
 from math import ceil
 from itertools import product
     attributes = ["tokenizer"]
     tokenizer_class = "AutoTokenizer"
+    @classmethod
+    def _load_tokenizer_from_pretrained(
+        cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
+    ):
+        return TokenizersBackend.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder=subfolder,
+            **kwargs,
+        )
     def __init__(
         self,
         tokenizer=None,