szxllm committed on
Commit
3d1c312
·
verified ·
1 Parent(s): 9223e06

Update peft_.py

Browse files
Files changed (1) hide show
  1. peft_.py +208 -212
peft_.py CHANGED
@@ -1,213 +1,209 @@
1
- """
2
- 参数高效微调 (PEFT) 模块
3
- 支持LoRA和Adapter
4
- """
5
- import torch
6
- import torch.nn as nn
7
- import math
8
-
9
- class LoRALayer(nn.Module):
10
- """低秩适应层 (LoRA)"""
11
- def __init__(
12
- self,
13
- in_features: int,
14
- out_features: int,
15
- rank: int = 8,
16
- alpha: float = 16.0,
17
- dropout: float = 0.0
18
- ):
19
- super().__init__()
20
- self.rank = rank
21
- self.alpha = alpha
22
- self.scaling = alpha / rank
23
-
24
- self.lora_A = nn.Parameter(torch.zeros(in_features, rank))
25
- self.lora_B = nn.Parameter(torch.zeros(rank, out_features))
26
-
27
- self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()
28
-
29
- nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
30
- nn.init.zeros_(self.lora_B)
31
-
32
- self.merged = False
33
-
34
- def forward(self, x: torch.Tensor) -> torch.Tensor:
35
- """前向传播"""
36
- result = x @ self.lora_A @ self.lora_B
37
- result = self.dropout(result)
38
- return result * self.scaling
39
-
40
- class LinearWithLoRA(nn.Module):
41
- """带LoRA的线性层"""
42
- def __init__(
43
- self,
44
- in_features: int,
45
- out_features: int,
46
- bias: bool = True,
47
- use_lora: bool = False,
48
- lora_rank: int = 8,
49
- lora_alpha: float = 16.0,
50
- lora_dropout: float = 0.0
51
- ):
52
- super().__init__()
53
- self.in_features = in_features
54
- self.out_features = out_features
55
- self.use_lora = use_lora
56
-
57
- self.base_linear = nn.Linear(in_features, out_features, bias=bias)
58
-
59
- if use_lora:
60
- self.lora = LoRALayer(
61
- in_features,
62
- out_features,
63
- lora_rank,
64
- lora_alpha,
65
- lora_dropout
66
- )
67
- self.merged = False
68
- else:
69
- self.lora = None
70
- self.merged = False
71
-
72
- def merge(self):
73
- """将LoRA权重合并到基础权重中"""
74
- if self.use_lora and not self.merged:
75
- lora_weight = (self.lora.lora_A @ self.lora.lora_B) * self.lora.scaling
76
- self.base_linear.weight.data += lora_weight.T
77
- self.merged = True
78
-
79
- def unmerge(self):
80
- """取消合并LoRA权重"""
81
- if self.use_lora and self.merged:
82
- lora_weight = (self.lora.lora_A @ self.lora.lora_B) * self.lora.scaling
83
- self.base_linear.weight.data -= lora_weight.T
84
- self.merged = False
85
-
86
- def forward(self, x: torch.Tensor) -> torch.Tensor:
87
- """前向传播"""
88
- output = self.base_linear(x)
89
-
90
- if self.use_lora and self.lora is not None and not self.merged:
91
- output = output + self.lora(x)
92
-
93
- return output
94
-
95
- class AdapterLayer(nn.Module):
96
- """Adapter层 - 轻量级微调"""
97
- def __init__(
98
- self,
99
- dim: int,
100
- bottleneck_dim: int = 64,
101
- dropout: float = 0.1,
102
- activation: str = 'gelu',
103
- residual_scale: float = 1.0
104
- ):
105
- super().__init__()
106
- self.residual_scale = residual_scale
107
-
108
- self.down_proj = nn.Linear(dim, bottleneck_dim)
109
-
110
- if activation == 'gelu':
111
- self.activation = nn.GELU()
112
- elif activation == 'relu':
113
- self.activation = nn.ReLU()
114
- elif activation == 'silu':
115
- self.activation = nn.SiLU()
116
- else:
117
- self.activation = nn.GELU()
118
-
119
- self.up_proj = nn.Linear(bottleneck_dim, dim)
120
- self.dropout = nn.Dropout(dropout)
121
-
122
- from components import RMSNorm
123
- self.layer_norm = RMSNorm(dim)
124
-
125
- self._init_weights()
126
-
127
- def _init_weights(self):
128
- """初始化权重"""
129
- nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))
130
- nn.init.zeros_(self.up_proj.weight)
131
- if self.down_proj.bias is not None:
132
- nn.init.zeros_(self.down_proj.bias)
133
- if self.up_proj.bias is not None:
134
- nn.init.zeros_(self.up_proj.bias)
135
-
136
- def forward(self, x: torch.Tensor) -> torch.Tensor:
137
- """前向传播"""
138
- residual = x
139
-
140
- x = self.layer_norm(x)
141
- x = self.down_proj(x)
142
- x = self.activation(x)
143
- x = self.dropout(x)
144
- x = self.up_proj(x)
145
- x = self.dropout(x)
146
-
147
- return residual + x * self.residual_scale
148
-
149
- class PrefixTuning(nn.Module):
150
- """Prefix Tuning"""
151
- def __init__(
152
- self,
153
- num_layers: int,
154
- num_tokens: int,
155
- dim: int,
156
- num_heads: int
157
- ):
158
- super().__init__()
159
- self.num_layers = num_layers
160
- self.num_tokens = num_tokens
161
- self.dim = dim
162
- self.num_heads = num_heads
163
-
164
- head_dim = dim // num_heads
165
- self.prefix = nn.Parameter(
166
- torch.randn(num_layers, 2, num_tokens, num_heads, head_dim)
167
- )
168
-
169
- nn.init.normal_(self.prefix, std=0.02)
170
-
171
- def forward(self, layer_idx: int, batch_size: int) -> torch.Tensor:
172
- """获取指定层的prefix"""
173
- prefix = self.prefix[layer_idx]
174
- prefix = prefix.unsqueeze(1).expand(
175
- 2, batch_size, self.num_heads, self.num_tokens, -1
176
- )
177
-
178
- return prefix
179
-
180
- class PromptTuning(nn.Module):
181
- """Prompt Tuning"""
182
- def __init__(
183
- self,
184
- num_tokens: int,
185
- dim: int,
186
- init_from_vocab: bool = False,
187
- vocab_embeddings: nn.Embedding = None
188
- ):
189
- super().__init__()
190
- self.num_tokens = num_tokens
191
- self.dim = dim
192
-
193
- self.prompt_embeddings = nn.Parameter(torch.randn(num_tokens, dim))
194
-
195
- if init_from_vocab and vocab_embeddings is not None:
196
- indices = torch.randint(0, vocab_embeddings.num_embeddings, (num_tokens,))
197
- self.prompt_embeddings.data = vocab_embeddings.weight[indices].clone()
198
- else:
199
- nn.init.normal_(self.prompt_embeddings, std=0.02)
200
-
201
- def forward(self, batch_size: int) -> torch.Tensor:
202
- """获取prompt embeddings"""
203
- return self.prompt_embeddings.unsqueeze(0).expand(batch_size, -1, -1)
204
-
205
- class IALayer(nn.Module):
206
- """(IA)³层"""
207
- def __init__(self, dim: int):
208
- super().__init__()
209
- self.scale = nn.Parameter(torch.ones(dim))
210
-
211
- def forward(self, x: torch.Tensor) -> torch.Tensor:
212
- """应用缩放"""
213
  return x * self.scale
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import math
4
+
5
class LoRALayer(nn.Module):
    """Low-rank adaptation (LoRA) branch.

    Computes ``dropout(x @ lora_A @ lora_B) * (alpha / rank)``. ``lora_B`` is
    initialised to zero, so a freshly built layer outputs zeros.

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Inner dimension of the factorisation; must be positive.
        alpha: Numerator of the output scaling factor ``alpha / rank``.
        dropout: Dropout probability applied to the branch output; values
            ``<= 0`` disable dropout.

    Raises:
        ValueError: If ``rank`` is not positive.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        rank: int = 8,
        alpha: float = 16.0,
        dropout: float = 0.0
    ):
        super().__init__()
        # Fail early with a clear message instead of a ZeroDivisionError (or an
        # opaque tensor-size error) further down.
        if rank <= 0:
            raise ValueError(f"rank must be a positive integer, got {rank}")

        self.rank = rank
        self.alpha = alpha
        self.scaling = alpha / rank

        self.lora_A = nn.Parameter(torch.zeros(in_features, rank))
        self.lora_B = nn.Parameter(torch.zeros(rank, out_features))

        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()

        # lora_A is stored as (in_features, rank), so kaiming_uniform_ would
        # read its fan-in from `rank` (dim 1). Draw from the bound that
        # kaiming_uniform_(a=sqrt(5)) yields for fan_in == in_features, i.e.
        # the same range nn.Linear(in_features, rank) would use.
        bound = 1.0 / math.sqrt(in_features) if in_features > 0 else 0.0
        nn.init.uniform_(self.lora_A, -bound, bound)
        nn.init.zeros_(self.lora_B)

        self.merged = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the scaled low-rank update for ``x``."""
        result = x @ self.lora_A @ self.lora_B
        result = self.dropout(result)
        return result * self.scaling
35
+
36
class LinearWithLoRA(nn.Module):
    """Linear layer with an optional LoRA branch.

    When ``use_lora`` is true the output is ``base_linear(x) + lora(x)``. The
    LoRA update can be folded into the base weight with :meth:`merge` and
    removed again with :meth:`unmerge`; while merged, ``forward`` skips the
    separate LoRA branch.

    Args:
        in_features: Input width of the linear layer.
        out_features: Output width of the linear layer.
        bias: Whether the base linear layer has a bias term.
        use_lora: Whether to create and apply a LoRA branch.
        lora_rank: Rank passed to the LoRA branch.
        lora_alpha: Alpha passed to the LoRA branch.
        lora_dropout: Dropout probability passed to the LoRA branch.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        bias: bool = True,
        use_lora: bool = False,
        lora_rank: int = 8,
        lora_alpha: float = 16.0,
        lora_dropout: float = 0.0
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.use_lora = use_lora

        self.base_linear = nn.Linear(in_features, out_features, bias=bias)

        if use_lora:
            self.lora = LoRALayer(
                in_features,
                out_features,
                lora_rank,
                lora_alpha,
                lora_dropout
            )
        else:
            self.lora = None
        self.merged = False

    def _lora_delta(self) -> torch.Tensor:
        """Return the scaled LoRA update laid out like ``base_linear.weight``."""
        delta = (self.lora.lora_A @ self.lora.lora_B) * self.lora.scaling
        # lora_A @ lora_B is (in, out); nn.Linear stores its weight as (out, in).
        # Align dtype/device with the base weight before the in-place update.
        return delta.T.to(self.base_linear.weight)

    def merge(self):
        """Fold the LoRA update into the base weight (no-op if already merged)."""
        if self.use_lora and self.lora is not None and not self.merged:
            # Update in place without recording the operation in autograd.
            with torch.no_grad():
                self.base_linear.weight.add_(self._lora_delta())
            self.merged = True

    def unmerge(self):
        """Subtract the LoRA update from the base weight (no-op if not merged)."""
        if self.use_lora and self.lora is not None and self.merged:
            with torch.no_grad():
                self.base_linear.weight.sub_(self._lora_delta())
            self.merged = False

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the base linear layer, plus the LoRA branch when not merged."""
        output = self.base_linear(x)

        if self.use_lora and self.lora is not None and not self.merged:
            output = output + self.lora(x)

        return output
90
+
91
class AdapterLayer(nn.Module):
    """Bottleneck adapter for lightweight fine-tuning.

    The input is normalised with ``RMSNorm`` (imported from the ``components``
    module), projected down to ``bottleneck_dim``, passed through the
    activation and dropout, projected back up to ``dim``, passed through
    dropout again, scaled by ``residual_scale`` and added to the input.

    Args:
        dim: Width of the input and output.
        bottleneck_dim: Width of the bottleneck projection.
        dropout: Dropout probability used after the activation and after the
            up-projection.
        activation: One of ``'gelu'``, ``'relu'`` or ``'silu'``; any other
            value falls back to GELU.
        residual_scale: Multiplier applied to the adapter branch before it is
            added to the input.
    """

    def __init__(
        self,
        dim: int,
        bottleneck_dim: int = 64,
        dropout: float = 0.1,
        activation: str = 'gelu',
        residual_scale: float = 1.0
    ):
        super().__init__()
        self.residual_scale = residual_scale

        self.down_proj = nn.Linear(dim, bottleneck_dim)

        # Unknown names deliberately fall back to GELU.
        activation_classes = {'gelu': nn.GELU, 'relu': nn.ReLU, 'silu': nn.SiLU}
        self.activation = activation_classes.get(activation, nn.GELU)()

        self.up_proj = nn.Linear(bottleneck_dim, dim)
        self.dropout = nn.Dropout(dropout)

        from components import RMSNorm
        self.layer_norm = RMSNorm(dim)

        self._init_weights()

    def _init_weights(self):
        """Initialise the projections; the up-projection and biases start at zero."""
        nn.init.kaiming_uniform_(self.down_proj.weight, a=math.sqrt(5))
        nn.init.zeros_(self.up_proj.weight)
        for projection in (self.down_proj, self.up_proj):
            if projection.bias is not None:
                nn.init.zeros_(projection.bias)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` plus the scaled adapter branch."""
        hidden = self.layer_norm(x)
        hidden = self.dropout(self.activation(self.down_proj(hidden)))
        hidden = self.dropout(self.up_proj(hidden))
        return x + hidden * self.residual_scale
144
+
145
class PrefixTuning(nn.Module):
    """Prefix tuning: learnable per-layer prefix tensors.

    Stores one parameter of shape
    ``(num_layers, 2, num_tokens, num_heads, dim // num_heads)`` initialised
    from a normal distribution with ``std=0.02``.

    Args:
        num_layers: Number of layers that receive a prefix.
        num_tokens: Number of prefix positions per layer.
        dim: Model width; the per-head width is ``dim // num_heads``.
        num_heads: Number of attention heads.
    """

    def __init__(
        self,
        num_layers: int,
        num_tokens: int,
        dim: int,
        num_heads: int
    ):
        super().__init__()
        self.num_layers = num_layers
        self.num_tokens = num_tokens
        self.dim = dim
        self.num_heads = num_heads

        head_dim = dim // num_heads
        self.prefix = nn.Parameter(
            torch.randn(num_layers, 2, num_tokens, num_heads, head_dim)
        )

        nn.init.normal_(self.prefix, std=0.02)

    def forward(self, layer_idx: int, batch_size: int) -> torch.Tensor:
        """Return the prefix for one layer, broadcast over the batch.

        Args:
            layer_idx: Index into the first dimension of the prefix parameter.
            batch_size: Size of the batch dimension to expand to.

        Returns:
            A view of shape
            ``(2, batch_size, num_heads, num_tokens, head_dim)``.
        """
        # Stored layout per layer: (2, num_tokens, num_heads, head_dim).
        layer_prefix = self.prefix[layer_idx]
        # expand() can only grow size-1 dimensions, so the token and head axes
        # have to be swapped explicitly to reach the heads-first output layout;
        # without this the call fails whenever num_tokens != num_heads.
        layer_prefix = layer_prefix.transpose(1, 2)
        return layer_prefix.unsqueeze(1).expand(
            2, batch_size, self.num_heads, self.num_tokens, -1
        )
175
+
176
class PromptTuning(nn.Module):
    """Prompt tuning: a learnable block of prompt embeddings.

    Args:
        num_tokens: Number of prompt positions.
        dim: Width of each prompt embedding.
        init_from_vocab: If true and ``vocab_embeddings`` is given, initialise
            the prompt from randomly chosen rows of the vocabulary embedding.
        vocab_embeddings: Embedding module to sample rows from. When it is
            ``None`` the prompt is initialised from a normal distribution with
            ``std=0.02`` regardless of ``init_from_vocab``.

    Raises:
        ValueError: If the vocabulary embedding width differs from ``dim``.
    """

    def __init__(
        self,
        num_tokens: int,
        dim: int,
        init_from_vocab: bool = False,
        vocab_embeddings: nn.Embedding = None
    ):
        super().__init__()
        self.num_tokens = num_tokens
        self.dim = dim

        self.prompt_embeddings = nn.Parameter(torch.randn(num_tokens, dim))

        if init_from_vocab and vocab_embeddings is not None:
            vocab_dim = vocab_embeddings.weight.shape[1]
            # Refuse a width mismatch; otherwise the parameter would silently
            # take the vocabulary's width and disagree with self.dim.
            if vocab_dim != dim:
                raise ValueError(
                    f"vocab_embeddings width {vocab_dim} does not match dim {dim}"
                )
            indices = torch.randint(0, vocab_embeddings.num_embeddings, (num_tokens,))
            # Detach before copying so the new parameter is an independent
            # leaf that keeps the vocabulary weights' dtype and device.
            sampled_rows = vocab_embeddings.weight.detach()[indices].clone()
            self.prompt_embeddings = nn.Parameter(sampled_rows)
        else:
            nn.init.normal_(self.prompt_embeddings, std=0.02)

    def forward(self, batch_size: int) -> torch.Tensor:
        """Return the prompt expanded to ``(batch_size, num_tokens, dim)``."""
        return self.prompt_embeddings.unsqueeze(0).expand(batch_size, -1, -1)
200
+
201
class IALayer(nn.Module):
    """(IA)^3 layer: multiplies the input by a learnable vector.

    Args:
        dim: Length of the scale vector, which is initialised to ones.
    """

    def __init__(self, dim: int):
        super().__init__()
        initial_scale = torch.ones(dim)
        self.scale = nn.Parameter(initial_scale)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` multiplied element-wise by the scale vector."""
        scaled = x * self.scale
        return scaled