Text Generation
Transformers
PyTorch
English
shram
research
sparse-attention
mixture-of-experts
custom_code
Instructions to use smithblack-0/SHRAM-dev with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use smithblack-0/SHRAM-dev with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="smithblack-0/SHRAM-dev", trust_remote_code=True)

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("smithblack-0/SHRAM-dev", trust_remote_code=True, dtype="auto")
- Notebooks
- Google Colab
- Kaggle
- Local Apps Settings
- vLLM
How to use smithblack-0/SHRAM-dev with vLLM:
Install from pip and serve model
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "smithblack-0/SHRAM-dev"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "smithblack-0/SHRAM-dev",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker
docker model run hf.co/smithblack-0/SHRAM-dev
- SGLang
How to use smithblack-0/SHRAM-dev with SGLang:
Install from pip and serve model
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "smithblack-0/SHRAM-dev" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "smithblack-0/SHRAM-dev",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
Use Docker images
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "smithblack-0/SHRAM-dev" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "smithblack-0/SHRAM-dev",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
- Docker Model Runner
How to use smithblack-0/SHRAM-dev with Docker Model Runner:
docker model run hf.co/smithblack-0/SHRAM-dev
Update architecture and tokenizer
Browse files
- huggingface.py +34 -36
huggingface.py
CHANGED
|
@@ -2877,34 +2877,25 @@ class MoSRAHRouter(nn.Module):
|
|
| 2877 |
return mask
|
| 2878 |
|
| 2879 |
@staticmethod
|
| 2880 |
-
def _check_bidding_converged(
|
|
|
|
|
|
|
| 2881 |
"""Raise if the bidding loop exhausted max_rounds without satisfying all tokens.
|
| 2882 |
|
| 2883 |
-
In compiled mode ``torch._check`` fires a C++ assertion
|
| 2884 |
-
(``capture_scalar_outputs=True`` is a precondition — see Unit 19.F.1).
|
| 2885 |
-
In eager mode raises ``RuntimeError`` directly.
|
| 2886 |
-
|
| 2887 |
-
Exhausting ``max_rounds`` indicates an extreme routing density case or an
|
| 2888 |
-
infeasible configuration where total capacity is insufficient for N * K
|
| 2889 |
-
demands. In normal training this should never occur; the default
|
| 2890 |
-
``max_bid_rounds=10`` covers approximately the 98th percentile of routing
|
| 2891 |
-
densities.
|
| 2892 |
-
|
| 2893 |
Args:
|
| 2894 |
-
|
| 2895 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2896 |
"""
|
| 2897 |
-
|
| 2898 |
-
|
| 2899 |
-
|
| 2900 |
-
|
| 2901 |
-
|
| 2902 |
-
|
| 2903 |
-
f"All tokens must have at least K accepted experts before the loop exits. "
|
| 2904 |
-
f"This indicates either an infeasible configuration (total remaining "
|
| 2905 |
-
f"capacity < N * K) or an extreme routing density. "
|
| 2906 |
-
f"Increase mosrah_overallocation_factor or max_bid_rounds."
|
| 2907 |
-
)
|
| 2908 |
|
| 2909 |
@classmethod
|
| 2910 |
def _run_bidding(
|
|
@@ -2984,9 +2975,6 @@ class MoSRAHRouter(nn.Module):
|
|
| 2984 |
proposals, acceptances, _ = torch.while_loop(
|
| 2985 |
cond_fn, body_fn, (proposals, acceptances, round_count),
|
| 2986 |
)
|
| 2987 |
-
|
| 2988 |
-
converged = (acceptances.sum(dim=-1) >= min_choices).all()
|
| 2989 |
-
cls._check_bidding_converged(converged, max_rounds)
|
| 2990 |
return acceptances
|
| 2991 |
|
| 2992 |
@classmethod
|
|
@@ -3069,17 +3057,27 @@ class MoSRAHRouter(nn.Module):
|
|
| 3069 |
# also satisfies the row bound, both constraints hold simultaneously.
|
| 3070 |
# Mask computation runs under no_grad: the boolean mask is a hard routing
|
| 3071 |
# decision and must not accumulate gradient memory through the solver.
|
| 3072 |
-
|
| 3073 |
-
|
| 3074 |
-
|
| 3075 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3076 |
|
| 3077 |
-
# Column-capacity mask violates the row bound: routing is concentrated
|
| 3078 |
-
# enough that per-expert capacity limits leave some tokens with fewer
|
| 3079 |
-
# than min_choices choices. The bidding solver handles this jointly.
|
| 3080 |
with torch.no_grad():
|
| 3081 |
-
|
| 3082 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3083 |
def forward(
|
| 3084 |
self,
|
| 3085 |
x: torch.Tensor,
|
|
|
|
| 2877 |
return mask
|
| 2878 |
|
| 2879 |
@staticmethod
|
| 2880 |
+
def _check_bidding_converged(acceptances: torch.Tensor,
|
| 2881 |
+
min_choices: int,
|
| 2882 |
+
max_rounds: int) -> None:
|
| 2883 |
"""Raise if the bidding loop exhausted max_rounds without satisfying all tokens.
|
| 2884 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2885 |
Args:
|
| 2886 |
+
acceptances: bool tensor of shape (B, N, L) indicating what experts L accepted
|
| 2887 |
+
what tokens.
|
| 2888 |
+
min_choices: Convergence has been reached if acceptances are such that a sum along
|
| 2889 |
+
N always has at least min_choices choices.
|
| 2890 |
+
max_rounds: The iteration ceiling that was applied, for the error message. Used
|
| 2891 |
+
for reporting
|
| 2892 |
"""
|
| 2893 |
+
msg = (
|
| 2894 |
+
f"balance_capacity bidding did not converge within {max_rounds} rounds. "
|
| 2895 |
+
f"Increase mosrah_overallocation_factor or max_bid_rounds."
|
| 2896 |
+
)
|
| 2897 |
+
converged = (acceptances.sum(dim=-1) >= min_choices).all()
|
| 2898 |
+
torch._assert_async(converged, msg)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2899 |
|
| 2900 |
@classmethod
|
| 2901 |
def _run_bidding(
|
|
|
|
| 2975 |
proposals, acceptances, _ = torch.while_loop(
|
| 2976 |
cond_fn, body_fn, (proposals, acceptances, round_count),
|
| 2977 |
)
|
|
|
|
|
|
|
|
|
|
| 2978 |
return acceptances
|
| 2979 |
|
| 2980 |
@classmethod
|
|
|
|
| 3057 |
# also satisfies the row bound, both constraints hold simultaneously.
|
| 3058 |
# Mask computation runs under no_grad: the boolean mask is a hard routing
|
| 3059 |
# decision and must not accumulate gradient memory through the solver.
|
| 3060 |
+
def skip(mask: torch.Tensor, logits: torch.Tensor)->torch.Tensor:
|
| 3061 |
+
"""Skip bidding on the mask"""
|
| 3062 |
+
return mask.clone()
|
| 3063 |
+
|
| 3064 |
+
def resolve_mask(mask: torch.Tensor, logits: torch.Tensor) -> torch.Tensor:
|
| 3065 |
+
"""Execute full bidding process"""
|
| 3066 |
+
return cls._run_bidding(logits,
|
| 3067 |
+
remaining_capacity,
|
| 3068 |
+
min_choices,
|
| 3069 |
+
max_rounds,
|
| 3070 |
+
capacity)
|
| 3071 |
|
|
|
|
|
|
|
|
|
|
| 3072 |
with torch.no_grad():
|
| 3073 |
+
col_capacity_mask = cls.get_mask(logits,
|
| 3074 |
+
dim=-2,
|
| 3075 |
+
n=remaining_capacity,
|
| 3076 |
+
capacity_scalar=capacity)
|
| 3077 |
+
mask_sufficient = (col_capacity_mask.sum(dim=-1) >= min_choices).all()
|
| 3078 |
+
final_mask = torch.cond(mask_sufficient, skip, resolve_mask, [col_capacity_mask, logits])
|
| 3079 |
+
cls._check_bidding_converged(final_mask, min_choices, max_rounds)
|
| 3080 |
+
return logits.masked_fill(~final_mask, mask_value)
|
| 3081 |
def forward(
|
| 3082 |
self,
|
| 3083 |
x: torch.Tensor,
|