shanjiaz committed on
Commit
a8bbe39
·
verified ·
1 Parent(s): d299c18

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,5 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- blobs/6460cb80c6bcd981a75868e609bd77e1597421e9f2c37be887e513fee4ca65ec filter=lfs diff=lfs merge=lfs -text
37
- blobs/b51a86299f0279039b3d495ee70e4870ca7af3b9621ed93c1559ec659da79e60 filter=lfs diff=lfs merge=lfs -text
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "PEagleDraftModel"
4
+ ],
5
+ "auto_map": {
6
+ "": "config.PEagleSpeculatorConfig"
7
+ },
8
+ "down_sample_ratio": 0.7,
9
+ "down_sample_ratio_min": 0.2,
10
+ "draft_vocab_size": 151936,
11
+ "dtype": "bfloat16",
12
+ "eagle_aux_hidden_state_layer_ids": [
13
+ 2,
14
+ 18,
15
+ 33
16
+ ],
17
+ "embed_requires_grad": true,
18
+ "mask_token_id": 151669,
19
+ "max_seq_len": 8192,
20
+ "norm_before_fc": false,
21
+ "norm_before_residual": false,
22
+ "num_depths": 4,
23
+ "speculators_config": {
24
+ "algorithm": "peagle",
25
+ "default_proposal_method": "greedy",
26
+ "proposal_methods": [
27
+ {
28
+ "accept_tolerance": 0.0,
29
+ "proposal_type": "greedy",
30
+ "speculative_tokens": 4,
31
+ "verifier_accept_k": 1
32
+ }
33
+ ],
34
+ "verifier": {
35
+ "architectures": [],
36
+ "name_or_path": "Qwen/Qwen3-8B"
37
+ }
38
+ },
39
+ "speculators_model_type": "peagle",
40
+ "speculators_version": "0.5.0.dev58",
41
+ "target_hidden_size": null,
42
+ "tie_word_embeddings": false,
43
+ "transformer_layer_config": {
44
+ "attention_bias": false,
45
+ "attention_dropout": 0.0,
46
+ "bos_token_id": 1,
47
+ "eos_token_id": 2,
48
+ "head_dim": 128,
49
+ "hidden_act": "silu",
50
+ "hidden_size": 4096,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 12288,
53
+ "max_position_embeddings": 40960,
54
+ "mlp_bias": false,
55
+ "model_type": "llama",
56
+ "num_attention_heads": 32,
57
+ "num_hidden_layers": 4,
58
+ "num_key_value_heads": 8,
59
+ "pad_token_id": null,
60
+ "pretraining_tp": 1,
61
+ "rms_norm_eps": 1e-06,
62
+ "rope_parameters": {
63
+ "rope_theta": 10000.0,
64
+ "rope_type": "default"
65
+ },
66
+ "tie_word_embeddings": false,
67
+ "use_cache": true,
68
+ "vocab_size": 151936
69
+ },
70
+ "transformers_version": "5.6.0"
71
+ }
config.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Literal
2
+
3
+ from pydantic import Field
4
+
5
+ from speculators import SpeculatorModelConfig
6
+ from speculators.models.eagle3.config import Eagle3SpeculatorConfig
7
+
8
+ __all__ = [
9
+ "PEagleSpeculatorConfig",
10
+ ]
11
+
12
+
13
+ @SpeculatorModelConfig.register("peagle")
14
+ class PEagleSpeculatorConfig(Eagle3SpeculatorConfig):
15
+ """
16
+ Configuration for P-EAGLE (Parallel EAGLE) speculator.
17
+
18
+ P-EAGLE extends EAGLE-3 with parallel multi-token prediction using
19
+ Conditional Drop Token (COD) sampling for memory-efficient training.
20
+
21
+ :param para_depths: Number of parallel prediction groups (default 8; must be between 1 and 16)
22
+ :param down_sample_ratio: Geometric decay ratio for COD sampling (retention rate r, strictly between 0 and 1)
23
+ :param down_sample_ratio_min: Minimum retention ratio floor (greater than 0, at most 1)
24
+ :param mask_token_id: Token ID used for padding unused positions in parallel groups (optional; defaults to None)
25
+ :param max_seq_len: Maximum sequence length for attention mask construction
26
+ """
27
+
28
+ speculators_model_type: Literal["peagle"] = "peagle" # type: ignore[assignment]
29
+ architectures: list[str] = Field(
30
+ default_factory=lambda: ["PEagleSpeculator"],
31
+ description="Model architectures that can load these weights",
32
+ )
33
+
34
+ para_depths: int = Field(
35
+ default=8,
36
+ description="Number of parallel prediction groups (depths)",
37
+ ge=1,
38
+ le=16,
39
+ )
40
+
41
+ down_sample_ratio: float = Field(
42
+ default=0.7,
43
+ description="Geometric decay ratio for COD sampling (retention rate r)",
44
+ gt=0.0,
45
+ lt=1.0,
46
+ )
47
+
48
+ down_sample_ratio_min: float = Field(
49
+ default=0.1,
50
+ description="Minimum retention ratio floor to prevent over-sampling",
51
+ gt=0.0,
52
+ le=1.0,
53
+ )
54
+
55
+ mask_token_id: int | None = Field(
56
+ default=None,
57
+ description="Token ID used for padding unused positions in parallel groups",
58
+ )
59
+
60
+ max_seq_len: int = Field(
61
+ default=2048,
62
+ description="Maximum sequence length for attention mask construction",
63
+ ge=128,
64
+ le=8192,
65
+ )
66
+
67
+ # Override Eagle3 default: P-EAGLE requires trainable embeddings
68
+ # (matches p-eagle-train)
69
+ embed_requires_grad: bool = Field(
70
+ default=True,
71
+ description=(
72
+ "Whether embedding layer weights require gradients during "
73
+ "training (True for P-EAGLE)"
74
+ ),
75
+ )
76
+
77
+ prediction_loss_weight: float = Field(
78
+ default=1.0,
79
+ description="Weight for prediction loss (cross-entropy on logits). "
80
+ "P-eagle-train uses only prediction loss, no hidden state distillation.",
81
+ gt=0.0,
82
+ )
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6460cb80c6bcd981a75868e609bd77e1597421e9f2c37be887e513fee4ca65ec
3
+ size 4183929280
optimizer_state_dict.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b51a86299f0279039b3d495ee70e4870ca7af3b9621ed93c1559ec659da79e60
3
+ size 5878569834
scheduler_state_dict.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65c261ecf76be2b856fa1d5b0f02d0e9bf164284775a68082ac9ebe1b6a7b7f
3
+ size 1531
val_metrics.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"loss_epoch": 2.2505897877767778, "full_acc_epoch": 0.4788787332964311, "position 0 acc_epoch": 0.6671951389062811, "position 1 acc_epoch": 0.4501549169114997, "position 2 acc_epoch": 0.31645943323750436, "position 3 acc_epoch": 0.22150448855365093}