Instructions to use stepfun-ai/Step-3.7-Flash-FP8 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.

Libraries

How to use stepfun-ai/Step-3.7-Flash-FP8 with Transformers:

# Use a pipeline as a high-level helper
from transformers import pipeline

# trust_remote_code=True is passed because this repository's config.json
# "auto_map" points at Python modules shipped in the repo itself
# (configuration_step3p7 / modeling_step3p7 / processing_step3).
# Only enable it for repositories whose code you trust.
pipe = pipeline("image-text-to-text", model="stepfun-ai/Step-3.7-Flash-FP8", trust_remote_code=True)
# A single user turn whose content mixes one image (referenced by URL)
# with one text question about that image.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
            {"type": "text", "text": "What animal is on the candy?"}
        ]
    },
]
# The call's return value is not captured here; assign it to a variable
# (or run this in a notebook cell) to inspect the generated output.
pipe(text=messages)

# Load model directly
from transformers import AutoModelForCausalLM
# Per this repository's config.json "auto_map", AutoModelForCausalLM resolves to
# modeling_step3p7.Step3p7ForConditionalGeneration, which lives in the repo,
# hence trust_remote_code=True.
# NOTE(review): dtype="auto" presumably picks the dtype recorded in the
# checkpoint/config — confirm against the installed transformers version.
model = AutoModelForCausalLM.from_pretrained("stepfun-ai/Step-3.7-Flash-FP8", trust_remote_code=True, dtype="auto")

Notebooks
Google Colab
Kaggle
Local Apps

vLLM

How to use stepfun-ai/Step-3.7-Flash-FP8 with vLLM:

Install from pip and serve the model

# Install vLLM from pip:
pip install vllm
# Start the vLLM server:
vllm serve "stepfun-ai/Step-3.7-Flash-FP8"
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "stepfun-ai/Step-3.7-Flash-FP8",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker

docker model run hf.co/stepfun-ai/Step-3.7-Flash-FP8

SGLang

How to use stepfun-ai/Step-3.7-Flash-FP8 with SGLang:

Install from pip and serve the model

# Install SGLang from pip:
pip install sglang
# Start the SGLang server:
python3 -m sglang.launch_server \
    --model-path "stepfun-ai/Step-3.7-Flash-FP8" \
    --host 0.0.0.0 \
    --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "stepfun-ai/Step-3.7-Flash-FP8",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Use Docker images

docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "stepfun-ai/Step-3.7-Flash-FP8" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/chat/completions" \
	-H "Content-Type: application/json" \
	--data '{
		"model": "stepfun-ai/Step-3.7-Flash-FP8",
		"messages": [
			{
				"role": "user",
				"content": [
					{
						"type": "text",
						"text": "Describe this image in one sentence."
					},
					{
						"type": "image_url",
						"image_url": {
							"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
						}
					}
				]
			}
		]
	}'

Docker Model Runner
How to use stepfun-ai/Step-3.7-Flash-FP8 with Docker Model Runner:
```
docker model run hf.co/stepfun-ai/Step-3.7-Flash-FP8
```

luotingdan committed 3 days ago

Commit

456ec15

1 Parent(s): 77ddf22

update processor config and support transformers 5.0+

Browse files

Files changed (4)

config.json +2 -1
configuration_step3p7.py +4 -16
modeling_step3p7.py +37 -27
processing_step3.py +11 -0

config.json CHANGED Viewed

@@ -4,6 +4,7 @@
   ],
   "auto_map": {
     "AutoConfig": "configuration_step3p7.Step3p7Config",
     "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
   },
   "model_type": "step3p7",
@@ -1213,4 +1214,4 @@
       "vit_large_projector"
     ]
   }
-}

   ],
   "auto_map": {
     "AutoConfig": "configuration_step3p7.Step3p7Config",
+    "AutoProcessor": "processing_step3.Step3VLProcessor",
     "AutoModelForCausalLM": "modeling_step3p7.Step3p7ForConditionalGeneration"
   },
   "model_type": "step3p7",
       "vit_large_projector"
     ]
   }
+}

configuration_step3p7.py CHANGED Viewed

@@ -91,23 +91,10 @@ class Step3p7TextConfig(PretrainedConfig):
         **kwargs,
     ) -> None:
         torch_dtype = kwargs.get("torch_dtype")
-        layer_types = _normalize_per_layer_values(layer_types,
                                                   num_hidden_layers)
-        swiglu_limits = _normalize_per_layer_values(swiglu_limits,
-                                                    num_hidden_layers)
-        swiglu_limits_shared = _normalize_per_layer_values(
-            swiglu_limits_shared, num_hidden_layers)
-        partial_rotary_factors = kwargs.get("partial_rotary_factors")
-        kwargs["partial_rotary_factors"] = _normalize_per_layer_values(
-            partial_rotary_factors, num_hidden_layers)
-        if isinstance(rope_theta, list):
-            rope_theta = _normalize_per_layer_values(rope_theta,
-                                                     num_hidden_layers)
         if isinstance(rope_scaling, dict):
             rope_scaling = dict(rope_scaling)
-        if use_rope_layers:
-            use_rope_layers = _normalize_per_layer_values(
-                use_rope_layers, num_hidden_layers)
         if share_expert_dim is None:
             share_expert_dim = share_expert_dims
         self.hidden_size = hidden_size
@@ -128,7 +115,7 @@ class Step3p7TextConfig(PretrainedConfig):
         self.head_dim = head_dim
         self.norm_expert_weight = norm_expert_weight
         self.moe_layers_enum = moe_layers_enum
-        self.layer_types = layer_types
         self.sliding_window = sliding_window
         self.pad_token_id = pad_token_id
         self.attention_dropout = attention_dropout
@@ -145,6 +132,7 @@ class Step3p7TextConfig(PretrainedConfig):
         super().__init__(**kwargs)
         if torch_dtype is not None:
             self.torch_dtype = torch_dtype
     def to_dict(self):
         output = super().to_dict()
@@ -216,4 +204,4 @@ class Step3p7Config(PretrainedConfig):
         self.max_position_embeddings = text_config.max_position_embeddings
         self.image_token_id = image_token_id
         # Help Auto classes find the correct implementation when saving/loading.
-        super().__init__(**kwargs)

         **kwargs,
     ) -> None:
         torch_dtype = kwargs.get("torch_dtype")
+        trim_layer_types = _normalize_per_layer_values(layer_types,
                                                   num_hidden_layers)
         if isinstance(rope_scaling, dict):
             rope_scaling = dict(rope_scaling)
         if share_expert_dim is None:
             share_expert_dim = share_expert_dims
         self.hidden_size = hidden_size
         self.head_dim = head_dim
         self.norm_expert_weight = norm_expert_weight
         self.moe_layers_enum = moe_layers_enum
+        self.layer_types = trim_layer_types
         self.sliding_window = sliding_window
         self.pad_token_id = pad_token_id
         self.attention_dropout = attention_dropout
         super().__init__(**kwargs)
         if torch_dtype is not None:
             self.torch_dtype = torch_dtype
+        self.layer_types = layer_types
     def to_dict(self):
         output = super().to_dict()
         self.max_position_embeddings = text_config.max_position_embeddings
         self.image_token_id = image_token_id
         # Help Auto classes find the correct implementation when saving/loading.
+        super().__init__(**kwargs)

modeling_step3p7.py CHANGED Viewed

@@ -199,36 +199,40 @@ class Step3p7PreTrainedModel(PreTrainedModel):
 class Step3p7RotaryEmbedding(nn.Module):
     def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
         super().__init__()
-        # BC: "rope_type" was originally "type"
         self.layer_idx = layer_idx
-        self.original_rope_parameters = None
-        if config.rope_parameters is not None:
-            self.original_rope_parameters = config.rope_parameters
-            config.rope_parameters = dict(config.rope_parameters)
-            self.rope_type = config.rope_parameters.get(
-                "rope_type", config.rope_parameters.get("type")
-            )
-        else:
-            self.rope_type = "default"
         self.max_seq_len_cached = config.max_position_embeddings
         self.original_max_seq_len = config.max_position_embeddings
-        partial_rotary_factors = getattr(
-            config, "partial_rotary_factors", None
-        )
         if partial_rotary_factors is not None:
-            config.partial_rotary_factor = partial_rotary_factors[self.layer_idx]
-        else:
-            config.partial_rotary_factor = 1.0
-        self.rope_theta = config.rope_theta
-        if isinstance(config.rope_theta, list):
-            self.rope_theta = config.rope_theta.copy()
-            config.rope_theta = self.rope_theta[self.layer_idx]
         self.config = copy.copy(config)
         if config.rope_parameters is not None:
-            self.config.rope_parameters = dict(config.rope_parameters)
         self.rope_init_fn = self.compute_default_rope_parameters
         if self.rope_type != "default":
             self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
@@ -238,8 +242,6 @@ class Step3p7RotaryEmbedding(nn.Module):
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
-        config.rope_theta = self.rope_theta
-        config.rope_parameters = self.original_rope_parameters
     @torch.no_grad()
     @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
@@ -288,10 +290,14 @@ class Step3p7RotaryEmbedding(nn.Module):
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
         """
         base = config.rope_theta
-        dim = (
             getattr(config, "head_dim", None)
             or config.hidden_size // config.num_attention_heads
         )
         attention_factor = 1.0  # Unused in this type of RoPE
@@ -968,7 +974,6 @@ class Step3p7TextModel(Step3p7TextPreTrainedModel, GenerationMixin):
             mask_kwargs = {
                 "config": self.config,
                 "attention_mask": attention_mask,
-                "cache_position": cache_position,
                 "past_key_values": past_key_values,
                 "position_ids": position_ids,
             }
@@ -1381,7 +1386,12 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
             **kwargs,
         )
-        if cache_position[0] == 0:
             # During cached decoding, input ids no longer contain image tokens,
             # so pixel values should only be passed at the first step.
             model_inputs["pixel_values"] = pixel_values
@@ -1392,4 +1402,4 @@ class Step3p7ForConditionalGeneration(Step3p7PreTrainedModel, GenerationMixin):
         if key.startswith("language_model."):
             return key[len("language_model.") :], True
-        return key, False

 class Step3p7RotaryEmbedding(nn.Module):
     def __init__(self, config: Step3p7TextConfig, device=None, layer_idx=None):
         super().__init__()
         self.layer_idx = layer_idx
         self.max_seq_len_cached = config.max_position_embeddings
         self.original_max_seq_len = config.max_position_embeddings
+        rope_theta = config.rope_theta
+        if isinstance(rope_theta, list):
+            rope_theta = rope_theta[0 if layer_idx is None else layer_idx]
+        partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0)
+        partial_rotary_factors = getattr(config, "partial_rotary_factors", None)
         if partial_rotary_factors is not None:
+            partial_rotary_factor = partial_rotary_factors[
+                0 if layer_idx is None else layer_idx
+            ]
+        self.rope_theta = rope_theta
+        self.partial_rotary_factor = partial_rotary_factor
         self.config = copy.copy(config)
+        self.config.rope_theta = rope_theta
+        self.config.partial_rotary_factor = partial_rotary_factor
         if config.rope_parameters is not None:
+            self.config.rope_parameters = copy.deepcopy(config.rope_parameters)
+            self.config.rope_parameters["rope_theta"] = rope_theta
+            self.config.rope_parameters["partial_rotary_factor"] = (
+                partial_rotary_factor
+            )
+            self.rope_type = self.config.rope_parameters.get(
+                "rope_type", self.config.rope_parameters.get("type")
+            )
+        else:
+            self.rope_type = "default"
         self.rope_init_fn = self.compute_default_rope_parameters
         if self.rope_type != "default":
             self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
         self.register_buffer("inv_freq", inv_freq, persistent=False)
         self.original_inv_freq = self.inv_freq
     @torch.no_grad()
     @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
             post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
         """
         base = config.rope_theta
+        partial_rotary_factor = getattr(
+            config, "partial_rotary_factor", 1.0
+        )
+        head_dim = (
             getattr(config, "head_dim", None)
             or config.hidden_size // config.num_attention_heads
         )
+        dim = int(head_dim * partial_rotary_factor)
         attention_factor = 1.0  # Unused in this type of RoPE
             mask_kwargs = {
                 "config": self.config,
                 "attention_mask": attention_mask,
                 "past_key_values": past_key_values,
                 "position_ids": position_ids,
             }
             **kwargs,
         )
+        generation_cache_position = model_inputs.get("cache_position", cache_position)
+        is_prefill = past_key_values is None
+        if generation_cache_position is not None and generation_cache_position.numel() > 0:
+            is_prefill = generation_cache_position[0].item() == 0
+        if is_prefill:
             # During cached decoding, input ids no longer contain image tokens,
             # so pixel values should only be passed at the first step.
             model_inputs["pixel_values"] = pixel_values
         if key.startswith("language_model."):
             return key[len("language_model.") :], True
+        return key, False

processing_step3.py CHANGED Viewed

@@ -16,6 +16,7 @@ from torchvision.transforms.functional import InterpolationMode
 from transformers.feature_extraction_utils import BatchFeature, TensorType
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
 from math import ceil
 from itertools import product
@@ -255,6 +256,16 @@ class Step3VLProcessor(ProcessorMixin):
     attributes = ["tokenizer"]
     tokenizer_class = "AutoTokenizer"
     def __init__(
         self,
         tokenizer=None,

 from transformers.feature_extraction_utils import BatchFeature, TensorType
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from transformers.tokenization_utils_tokenizers import TokenizersBackend
 from math import ceil
 from itertools import product
     attributes = ["tokenizer"]
     tokenizer_class = "AutoTokenizer"
+    @classmethod
+    def _load_tokenizer_from_pretrained(
+        cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
+    ):
+        return TokenizersBackend.from_pretrained(
+            pretrained_model_name_or_path,
+            subfolder=subfolder,
+            **kwargs,
+        )
     def __init__(
         self,
         tokenizer=None,