Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -26,48 +26,71 @@ def tokenize_instructions(tokenizer, instructions):
|
|
| 26 |
add_generation_prompt=True,
|
| 27 |
).input_ids
|
| 28 |
|
| 29 |
-
def find_steering_vecs(model, base_toks, target_toks, batch_size=16):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
device = model.device
|
| 31 |
num_its = len(range(0, base_toks.shape[0], batch_size))
|
| 32 |
steering_vecs = {}
|
| 33 |
-
for i in tqdm(range(0, base_toks.shape[0], batch_size)):
|
| 34 |
-
|
|
|
|
| 35 |
target_out = model(target_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
|
| 36 |
-
for layer in range(len(base_out)):
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
| 40 |
steering_vecs[layer] += torch.mean(target_out[layer][:,-1,:].detach().cpu() - base_out[layer][:,-1,:].detach().cpu(), dim=0)/num_its
|
| 41 |
return steering_vecs
|
| 42 |
|
| 43 |
-
def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=None, proj=True, batch_size=16):
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
for i in range(len(model.model.layers)):
|
| 59 |
-
if layer is None
|
| 60 |
handles.append(model.model.layers[i].register_forward_pre_hook(modify_activation()))
|
| 61 |
-
|
|
|
|
|
|
|
| 62 |
outs_all = []
|
| 63 |
for i in tqdm(range(0, test_toks.shape[0], batch_size)):
|
| 64 |
-
outs = model.generate(test_toks[i:i+batch_size],
|
| 65 |
outs_all.append(outs)
|
| 66 |
outs_all = torch.cat(outs_all, dim=0)
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
handle
|
| 70 |
-
|
| 71 |
return outs_all
|
| 72 |
|
| 73 |
def create_steering_vector(towards, away):
|
|
@@ -80,17 +103,25 @@ def create_steering_vector(towards, away):
|
|
| 80 |
steering_vecs = find_steering_vecs(model, away_toks, towards_toks)
|
| 81 |
return steering_vecs
|
| 82 |
|
| 83 |
-
def chat(message, history,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
history_formatted = [{"role": "user", "content": message}]
|
| 85 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
input_ids = tokenize_instructions(tokenizer, [history_formatted])
|
| 87 |
|
| 88 |
generations_baseline = do_steering(model, input_ids.to(device), None)
|
| 89 |
for j in range(generations_baseline.shape[0]):
|
| 90 |
-
response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True
|
| 91 |
|
| 92 |
if steering_vec is not None:
|
| 93 |
-
generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=
|
| 94 |
for j in range(generation_intervene.shape[0]):
|
| 95 |
response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
|
| 96 |
|
|
@@ -104,7 +135,7 @@ def chat(message, history, steering_vec, layer):
|
|
| 104 |
def launch_app():
|
| 105 |
with gr.Blocks() as demo:
|
| 106 |
steering_vec = gr.State(None)
|
| 107 |
-
layer = gr.State(
|
| 108 |
|
| 109 |
away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
|
| 110 |
|
|
@@ -129,6 +160,7 @@ def launch_app():
|
|
| 129 |
- Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
|
| 130 |
This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
|
| 131 |
- You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
|
|
|
|
| 132 |
|
| 133 |
3. **Chat with the Model:**
|
| 134 |
- Type a message in the chatbox and press Enter. The model will generate two responses:
|
|
@@ -155,21 +187,25 @@ def launch_app():
|
|
| 155 |
|
| 156 |
with gr.Row():
|
| 157 |
create_vector = gr.Button("Create Steering Vector")
|
| 158 |
-
layer_slider = gr.Slider(minimum=
|
| 159 |
|
| 160 |
def create_vector_and_set_layer(towards, away, layer_value):
|
| 161 |
vectors = create_steering_vector(towards, away)
|
| 162 |
layer.value = int(layer_value)
|
| 163 |
steering_vec.value = vectors
|
|
|
|
| 164 |
return f"Steering vector created for layer {layer_value}"
|
| 165 |
create_vector.click(create_vector_and_set_layer, [towards, away, layer_slider], gr.Textbox())
|
| 166 |
|
| 167 |
chatbot = gr.Chatbot()
|
| 168 |
msg = gr.Textbox()
|
| 169 |
|
| 170 |
-
msg.submit(chat, [msg, chatbot,
|
| 171 |
|
| 172 |
demo.launch()
|
| 173 |
|
| 174 |
if __name__ == "__main__":
|
| 175 |
launch_app()
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
add_generation_prompt=True,
|
| 27 |
).input_ids
|
| 28 |
|
| 29 |
+
def find_steering_vecs(model, base_toks, target_toks, batch_size=16):
    '''
    Find the steering vector from base_toks to target_toks (we do target_toks - base_toks),
    averaged over all examples, one vector per hidden layer.

    Inputs:
    :param model: the model to use (must return hidden states via output_hidden_states=True)
    :param base_toks: the base tokens [len, seq_len]
    :param target_toks: the target tokens [len, seq_len]
    :param batch_size: number of examples per forward pass
    Output:
    :return steering_vecs: dict mapping layer index -> steering vector [hidden_size]
    '''
    device = model.device
    n_total = base_toks.shape[0]
    steering_vecs = {}
    for i in tqdm(range(0, n_total, batch_size)):
        # no_grad: we only need activations, not a graph — saves memory; values unchanged
        with torch.no_grad():
            # tuple of length num_layers, each element [batch_size, seq_len, hidden_size]
            base_out = model(base_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
            target_out = model(target_toks[i:i+batch_size].to(device), output_hidden_states=True).hidden_states
        for layer in range(len(base_out)):
            # last-token activation difference, summed over the batch and divided by the
            # TOTAL example count: this yields the exact mean even when the final batch
            # is smaller than batch_size (the old mean(...)/num_its form was biased there)
            diff = (target_out[layer][:, -1, :].detach().cpu()
                    - base_out[layer][:, -1, :].detach().cpu()).sum(dim=0) / n_total  # [hidden_size]
            if layer in steering_vecs:
                steering_vecs[layer] += diff
            else:
                steering_vecs[layer] = diff
    return steering_vecs
|
| 53 |
|
| 54 |
+
def do_steering(model, test_toks, steering_vec, scale=1, normalise=True, layer=None, proj=True, batch_size=16):
    '''
    Generate from the model while (optionally) steering its hidden activations
    via forward-pre-hooks that subtract a (scaled) steering component.

    Input:
    :param model: the model to use
    :param test_toks: the test tokens [len, seq_len]
    :param steering_vec: the steering vector [hidden_size]; None disables steering
    :param scale: the scale applied to the subtracted steering component
    :param normalise: if True, use the unit-norm steering vector inside the hook
    :param layer: the layer to modify; if None we modify all layers
    :param proj: whether to subtract the projection of the activations onto the
        steering vector rather than the raw vector
    :param batch_size: generation batch size
    Output:
    :return output: the steered model output [len, generated_seq_len]
    '''
    # define a hook to modify the input into the layer
    handles = []
    if steering_vec is not None:
        def modify_activation():
            def hook(model, input):
                sv = steering_vec / steering_vec.norm() if normalise else steering_vec
                if proj:
                    # per-position projection coefficient onto sv, broadcast back to [b, l, h]
                    # NOTE(review): when normalise is False this is not a true orthogonal
                    # projection (missing 1/||sv||^2 factor) — confirm this is intended
                    sv = einsum(input[0], sv.view(-1, 1), 'b l h, h s -> b l s') * sv
                # in-place edit so the modified tuple element reaches the layer
                input[0][:, :, :] = input[0][:, :, :] - scale * sv
            return hook
        for i in range(len(model.model.layers)):
            # hook every layer when layer is None, otherwise only the requested one
            # (the old code had two identical branches: `if layer is None` / `elif i == layer`)
            if layer is None or i == layer:
                handles.append(model.model.layers[i].register_forward_pre_hook(modify_activation()))
    # pass through the model; try/finally guarantees hooks are removed even if
    # generation raises, so the model is never left permanently steered
    outs_all = []
    try:
        for i in tqdm(range(0, test_toks.shape[0], batch_size)):
            outs = model.generate(test_toks[i:i+batch_size], max_new_tokens=60)  # [num_samples, seq_len]
            outs_all.append(outs)
        outs_all = torch.cat(outs_all, dim=0)
    finally:
        # remove all hooks
        for handle in handles:
            handle.remove()
    return outs_all
|
| 95 |
|
| 96 |
def create_steering_vector(towards, away):
|
|
|
|
| 103 |
steering_vecs = find_steering_vecs(model, away_toks, towards_toks)
|
| 104 |
return steering_vecs
|
| 105 |
|
| 106 |
+
def chat(message, history, towards, away, layer_value):
|
| 107 |
+
|
| 108 |
+
steering_vec = create_steering_vector(towards, away)
|
| 109 |
+
layer = int(layer_value)
|
| 110 |
+
|
| 111 |
history_formatted = [{"role": "user", "content": message}]
|
| 112 |
|
| 113 |
+
print(f"layer {layer}")
|
| 114 |
+
print(f"steering vec {steering_vec}")
|
| 115 |
+
print(f"steering vec chosen {steering_vec[layer]}")
|
| 116 |
+
|
| 117 |
input_ids = tokenize_instructions(tokenizer, [history_formatted])
|
| 118 |
|
| 119 |
generations_baseline = do_steering(model, input_ids.to(device), None)
|
| 120 |
for j in range(generations_baseline.shape[0]):
|
| 121 |
+
response_baseline = f"BASELINE: {tokenizer.decode(generations_baseline[j], skip_special_tokens=True)}"
|
| 122 |
|
| 123 |
if steering_vec is not None:
|
| 124 |
+
generation_intervene = do_steering(model, input_ids.to(device), steering_vec[layer].to(device), scale=3, layer=layer)
|
| 125 |
for j in range(generation_intervene.shape[0]):
|
| 126 |
response_intervention = f"INTERVENTION: {tokenizer.decode(generation_intervene[j], skip_special_tokens=True)}"
|
| 127 |
|
|
|
|
| 135 |
def launch_app():
|
| 136 |
with gr.Blocks() as demo:
|
| 137 |
steering_vec = gr.State(None)
|
| 138 |
+
layer = gr.State(6)
|
| 139 |
|
| 140 |
away_default = ['hate','i hate this', 'hating the', 'hater', 'hating', 'hated in']
|
| 141 |
|
|
|
|
| 160 |
- Click the **"Create Steering Vector"** button to generate a vector that will nudge the model’s responses.
|
| 161 |
This vector will attempt to shift the model’s behavior towards the concepts in the "Towards" box and away from the concepts in the "Away" box.
|
| 162 |
- You can also adjust the **layer slider** to choose which layer of the model the steering vector will affect.
|
| 163 |
+
- make sure you have equal examples of towards & away or the app will throw an error
|
| 164 |
|
| 165 |
3. **Chat with the Model:**
|
| 166 |
- Type a message in the chatbox and press Enter. The model will generate two responses:
|
|
|
|
| 187 |
|
| 188 |
with gr.Row():
|
| 189 |
create_vector = gr.Button("Create Steering Vector")
|
| 190 |
+
layer_slider = gr.Slider(minimum=1, maximum=len(model.model.layers)-1, step=1, label="Layer")
|
| 191 |
|
| 192 |
def create_vector_and_set_layer(towards, away, layer_value):
|
| 193 |
vectors = create_steering_vector(towards, away)
|
| 194 |
layer.value = int(layer_value)
|
| 195 |
steering_vec.value = vectors
|
| 196 |
+
print(f"layer {layer.value}")
|
| 197 |
return f"Steering vector created for layer {layer_value}"
|
| 198 |
create_vector.click(create_vector_and_set_layer, [towards, away, layer_slider], gr.Textbox())
|
| 199 |
|
| 200 |
chatbot = gr.Chatbot()
|
| 201 |
msg = gr.Textbox()
|
| 202 |
|
| 203 |
+
msg.submit(chat, [msg, chatbot, towards, away, layer_slider], chatbot)
|
| 204 |
|
| 205 |
demo.launch()
|
| 206 |
|
| 207 |
if __name__ == "__main__":
|
| 208 |
launch_app()
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
## TODO(debug): the steering vector is generated correctly, yet the intervention
## output matches the baseline — verify that the forward-pre-hook modification is
## actually applied during generate() and that the vector reaches the hook on-device.
|