update readme to show the combine moe version
README.md
CHANGED
@@ -49,6 +49,49 @@ The official version of DeepSeek-OCR has limited the transformers version to 4.4
Feel free to opt for various attention implementations such as Flash Attention or SDPA to leverage the latest optimizations in transformers for a performance boost.
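For example (a minimal sketch mirroring the loading calls later in this README; `model_name` is whatever DeepSeek-OCR checkpoint you use):

```python
from transformers import AutoModel

model_name = 'deepseek-ai/DeepSeek-OCR'  # or any compatible checkpoint

# pick the attention backend at load time: 'flash_attention_2', 'sdpa', or 'eager'
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2',
                                  trust_remote_code=True, use_safetensors=True)
```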

## Combined MoE

In Transformer-based Mixture-of-Experts (MoE) models, the conventional approach relies on an MoE gating module to select experts and then routes hidden states through the selected experts in an iterative Python-level loop. This often results in host-bound comparisons, which can significantly slow down token generation, especially on Ascend hardware.
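For reference, the conventional dispatch looks roughly like the following (a minimal illustrative sketch, not the actual DeepSeek-V2 code; the tensor names mirror the combined version below):

```python
import torch

# Conventional per-expert dispatch: visit every expert in a Python loop.
# The data-dependent torch.where/indexing below forces a host round-trip
# per expert, which is what makes generation host-bound.
def loop_moe_forward(experts, hidden_states, selected_experts, routing_weights):
    # hidden_states: (tokens, hidden); selected_experts / routing_weights: (tokens, top_k)
    final_hidden_states = torch.zeros_like(hidden_states)
    for expert_id, expert in enumerate(experts):
        token_idx, topk_slot = torch.where(selected_experts == expert_id)
        if token_idx.numel() == 0:  # host-bound comparison on every expert
            continue
        expert_out = expert(hidden_states[token_idx])
        final_hidden_states[token_idx] += routing_weights[token_idx, topk_slot, None] * expert_out
    return final_hidden_states
```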
To address this, we introduce a method that consolidates each MoE layer into three stacked weight tensors: w1 from every expert's gate_proj, w3 from up_proj, and w2 from down_proj. This design is particularly suitable for smaller MoE models whose experts can all be loaded into memory at once. Below is the key implementation:

```python
import torch
from torch import nn

# combine the weights of all routed experts once, before inference
# (this is what model.combine_moe() does for each decoder layer):
for layer in self.model.layers:
    if isinstance(layer.mlp, DeepseekV2MoE):
        moe_layer = layer.mlp
        # stack each expert's projection into one (n_experts, in_dim, out_dim) tensor
        moe_layer.w1 = nn.Parameter(torch.stack([moe_layer.experts[i].gate_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]), requires_grad=False)
        moe_layer.w2 = nn.Parameter(torch.stack([moe_layer.experts[i].down_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]), requires_grad=False)
        moe_layer.w3 = nn.Parameter(torch.stack([moe_layer.experts[i].up_proj.weight.T for i in range(moe_layer.config.n_routed_experts)]), requires_grad=False)
        # all experts share the same activation; keep a reference for the patched forward
        moe_layer.act = moe_layer.experts[0].act_fn

# new forward method to patch onto DeepseekV2MoE
def new_forward_for_moe(self, hidden_states):
    batch_size, sequence_length, hidden_dim = hidden_states.shape
    selected_experts, routing_weights = self.gate(hidden_states)
    # scatter the top-k routing weights into a dense (tokens, n_routed_experts) score matrix
    router_scores = torch.zeros(size=(batch_size * sequence_length, self.config.n_routed_experts), device=hidden_states.device, dtype=hidden_states.dtype)
    # we cast back to the input dtype
    routing_weights = routing_weights.to(hidden_states.dtype)
    router_scores = torch.scatter_add(router_scores, -1, selected_experts, routing_weights)
    hidden_states = hidden_states.view(-1, hidden_dim)
    if self.config.n_shared_experts is not None:
        shared_expert_output = self.shared_experts(hidden_states)

    # run all experts at once as batched matmuls instead of a per-expert loop
    hidden_w1 = torch.matmul(hidden_states, self.w1)
    hidden_w3 = torch.matmul(hidden_states, self.w3)
    hidden_states = self.act(hidden_w1) * hidden_w3
    # weight each expert's output by its routing score, then sum over experts
    hidden_states = torch.bmm(hidden_states, self.w2) * torch.transpose(router_scores, 0, 1).unsqueeze(-1)
    final_hidden_states = hidden_states.sum(dim=0, dtype=hidden_states.dtype)
    if self.config.n_shared_experts is not None:
        final_hidden_states = final_hidden_states + shared_expert_output
    return final_hidden_states.view(batch_size, sequence_length, hidden_dim)
```
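The forward above still has to be attached to each MoE layer. The checkpoints in this repo expose the whole procedure as `model.combine_moe()` (called in the usage examples below); the binding step itself would look roughly like this sketch, assuming standard method patching:

```python
import types

# attach the patched forward to every combined MoE layer
# (sketch of what a combine_moe() helper would do after stacking the weights)
for layer in model.model.layers:
    if isinstance(layer.mlp, DeepseekV2MoE):
        layer.mlp.forward = types.MethodType(new_forward_for_moe, layer.mlp)
```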
As a result, we achieve a 3–4x speedup in OCR text generation, which makes the combined-MoE model far more practical for production use.
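To check the gain on your own documents, you can time a generation pass with and without the combine step (a rough sketch; `model.infer` is the DeepSeek-OCR inference entry point and the arguments are placeholders):

```python
import time

# run once without and once after model.combine_moe(), then compare
start = time.perf_counter()
res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path='./output')
print(f"elapsed: {time.perf_counter() - start:.2f} s")
```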

## MindSpore Usage
Inference using Hugging Face transformers on Ascend NPUs. Requirements tested with MindSpore 2.7 + CANN 8.2:

@@ -74,6 +117,9 @@ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, dtype=mindspore.float16, _attn_implementation='sdpa', trust_remote_code=True, use_safetensors=True, device_map='auto')
model = model.eval()

# combine experts
model.combine_moe()

# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'your_image.jpg'

@@ -114,6 +160,10 @@ model_name = 'lvyufeng/DeepSeek-OCR'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, dtype=torch.bfloat16, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True, device_map='auto')
model = model.eval()

# combine experts
model.combine_moe()

# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'your_image.jpg'