Brianpuz committed on
Commit
fbb861f
·
1 Parent(s): 5af26ce

Fix message format

Browse files
Files changed (1) hide show
  1. app.py +30 -24
app.py CHANGED
@@ -104,7 +104,7 @@ class AbliterationProcessor:
104
  def process_abliteration(self, model_id, harmful_text, harmless_text, instructions,
105
  scale_factor, skip_begin, skip_end, layer_fraction,
106
  private_repo, export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None,
107
- progress=gr.Progress(track_tqdm=False)):
108
  """Execute abliteration processing and upload to HuggingFace"""
109
  if oauth_token is None or oauth_token.token is None:
110
  return (
@@ -127,12 +127,12 @@ class AbliterationProcessor:
127
  repo_owner = "self"
128
 
129
  try:
130
- progress(desc="STEP 1/14: Loading model...")
131
  # Load model
132
  if self.model is None or self.tokenizer is None:
133
  self.load_model(model_id)
134
 
135
- progress(desc="STEP 2/14: Parsing instructions...")
136
  # Parse text content
137
  harmful_instructions = [line.strip() for line in harmful_text.strip().split('\n') if line.strip()]
138
  harmless_instructions = [line.strip() for line in harmless_text.strip().split('\n') if line.strip()]
@@ -141,12 +141,12 @@ class AbliterationProcessor:
141
  harmful_instructions = random.sample(harmful_instructions, min(instructions, len(harmful_instructions)))
142
  harmless_instructions = random.sample(harmless_instructions, min(instructions, len(harmless_instructions)))
143
 
144
- progress(desc="STEP 3/14: Calculating layer index...")
145
  # Calculate layer index
146
  layer_idx = int(len(self.model.model.layers) * layer_fraction)
147
  pos = -1
148
 
149
- progress(desc="STEP 4/14: Generating harmful tokens...")
150
  # Generate tokens
151
  harmful_toks = [
152
  self.tokenizer.apply_chat_template(
@@ -156,7 +156,7 @@ class AbliterationProcessor:
156
  ) for insn in harmful_instructions
157
  ]
158
 
159
- progress(desc="STEP 5/14: Generating harmless tokens...")
160
  harmless_toks = [
161
  self.tokenizer.apply_chat_template(
162
  conversation=[{"role": "user", "content": insn}],
@@ -175,13 +175,13 @@ class AbliterationProcessor:
175
  output_hidden_states=True
176
  )
177
 
178
- progress(desc="STEP 6/14: Processing harmful instructions...")
179
  harmful_outputs = [generate(toks) for toks in harmful_toks]
180
 
181
- progress(desc="STEP 7/14: Processing harmless instructions...")
182
  harmless_outputs = [generate(toks) for toks in harmless_toks]
183
 
184
- progress(desc="STEP 8/14: Extracting hidden states...")
185
  # Extract hidden states
186
  harmful_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmful_outputs]
187
  harmless_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmless_outputs]
@@ -189,7 +189,7 @@ class AbliterationProcessor:
189
  harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
190
  harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
191
 
192
- progress(desc="STEP 9/14: Calculating refusal direction...")
193
  # Calculate refusal direction
194
  refusal_dir = harmful_mean - harmless_mean
195
  refusal_dir = refusal_dir / refusal_dir.norm()
@@ -201,11 +201,11 @@ class AbliterationProcessor:
201
  self.refusal_dir = refusal_dir
202
  self.projection_matrix = projection_matrix
203
 
204
- progress(desc="STEP 10/14: Updating model weights...")
205
  # Modify model weights
206
  self.modify_layer_weights_optimized(projection_matrix, skip_begin, skip_end, scale_factor, progress)
207
 
208
- progress(desc="STEP 11/14: Preparing model for upload...")
209
  # Create temporary directory to save model
210
  with tempfile.TemporaryDirectory() as temp_dir:
211
  # Save model in safetensors format
@@ -213,7 +213,7 @@ class AbliterationProcessor:
213
  self.tokenizer.save_pretrained(temp_dir)
214
  torch.save(self.refusal_dir, os.path.join(temp_dir, "refusal_dir.pt"))
215
 
216
- progress(desc="STEP 12/14: Uploading to HuggingFace...")
217
  # Upload to HuggingFace
218
  repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
219
  model_name = model_id.split("/")[-1]
@@ -237,7 +237,7 @@ class AbliterationProcessor:
237
  repo_id=repo_id
238
  )
239
 
240
- progress(desc="STEP 13/14: Creating model card...")
241
  # Create model card
242
  try:
243
  original_card = ModelCard.load(model_id, token=oauth_token.token)
@@ -252,7 +252,7 @@ class AbliterationProcessor:
252
  repo_id=repo_id
253
  )
254
 
255
- progress(desc="STEP 14/14: Complete!")
256
  return (
257
  f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
258
  f"llama{np.random.randint(9)}.png",
@@ -272,7 +272,7 @@ class AbliterationProcessor:
272
 
273
  for i, layer_idx in enumerate(layers_to_modify):
274
  if progress:
275
- progress(desc=f"STEP 10/14: Updating layer {layer_idx+1}/{num_layers} (Layer {i+1}/{total_layers})")
276
 
277
  layer = self.model.model.layers[layer_idx]
278
 
@@ -296,9 +296,15 @@ class AbliterationProcessor:
296
  try:
297
  # Build conversation history
298
  conversation = []
299
- for human, assistant in history:
300
- conversation.append({"role": "user", "content": human})
301
- conversation.append({"role": "assistant", "content": assistant})
 
 
 
 
 
 
302
 
303
  # Add current message
304
  conversation.append({"role": "user", "content": message})
@@ -530,12 +536,12 @@ def create_interface():
530
 
531
  # Chat functionality
532
  def user(user_message, history):
533
- return "", history + [[user_message, None]]
534
 
535
  def bot(history):
536
- if history and history[-1][1] is None:
537
- response, _ = processor.chat(history[-1][0], history[:-1])
538
- history[-1][1] = response
539
  return history
540
 
541
  msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
@@ -546,7 +552,7 @@ def create_interface():
546
  bot, chatbot, chatbot
547
  )
548
 
549
- clear.click(lambda: None, None, chatbot, queue=False)
550
 
551
  # Bind organization selection event
552
  export_to_org.change(
 
104
  def process_abliteration(self, model_id, harmful_text, harmless_text, instructions,
105
  scale_factor, skip_begin, skip_end, layer_fraction,
106
  private_repo, export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None,
107
+ progress=gr.Progress()):
108
  """Execute abliteration processing and upload to HuggingFace"""
109
  if oauth_token is None or oauth_token.token is None:
110
  return (
 
127
  repo_owner = "self"
128
 
129
  try:
130
+ progress(0, desc="STEP 1/14: Loading model...")
131
  # Load model
132
  if self.model is None or self.tokenizer is None:
133
  self.load_model(model_id)
134
 
135
+ progress(0.1, desc="STEP 2/14: Parsing instructions...")
136
  # Parse text content
137
  harmful_instructions = [line.strip() for line in harmful_text.strip().split('\n') if line.strip()]
138
  harmless_instructions = [line.strip() for line in harmless_text.strip().split('\n') if line.strip()]
 
141
  harmful_instructions = random.sample(harmful_instructions, min(instructions, len(harmful_instructions)))
142
  harmless_instructions = random.sample(harmless_instructions, min(instructions, len(harmless_instructions)))
143
 
144
+ progress(0.2, desc="STEP 3/14: Calculating layer index...")
145
  # Calculate layer index
146
  layer_idx = int(len(self.model.model.layers) * layer_fraction)
147
  pos = -1
148
 
149
+ progress(0.3, desc="STEP 4/14: Generating harmful tokens...")
150
  # Generate tokens
151
  harmful_toks = [
152
  self.tokenizer.apply_chat_template(
 
156
  ) for insn in harmful_instructions
157
  ]
158
 
159
+ progress(0.4, desc="STEP 5/14: Generating harmless tokens...")
160
  harmless_toks = [
161
  self.tokenizer.apply_chat_template(
162
  conversation=[{"role": "user", "content": insn}],
 
175
  output_hidden_states=True
176
  )
177
 
178
+ progress(0.5, desc="STEP 6/14: Processing harmful instructions...")
179
  harmful_outputs = [generate(toks) for toks in harmful_toks]
180
 
181
+ progress(0.6, desc="STEP 7/14: Processing harmless instructions...")
182
  harmless_outputs = [generate(toks) for toks in harmless_toks]
183
 
184
+ progress(0.7, desc="STEP 8/14: Extracting hidden states...")
185
  # Extract hidden states
186
  harmful_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmful_outputs]
187
  harmless_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmless_outputs]
 
189
  harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
190
  harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
191
 
192
+ progress(0.8, desc="STEP 9/14: Calculating refusal direction...")
193
  # Calculate refusal direction
194
  refusal_dir = harmful_mean - harmless_mean
195
  refusal_dir = refusal_dir / refusal_dir.norm()
 
201
  self.refusal_dir = refusal_dir
202
  self.projection_matrix = projection_matrix
203
 
204
+ progress(0.85, desc="STEP 10/14: Updating model weights...")
205
  # Modify model weights
206
  self.modify_layer_weights_optimized(projection_matrix, skip_begin, skip_end, scale_factor, progress)
207
 
208
+ progress(0.9, desc="STEP 11/14: Preparing model for upload...")
209
  # Create temporary directory to save model
210
  with tempfile.TemporaryDirectory() as temp_dir:
211
  # Save model in safetensors format
 
213
  self.tokenizer.save_pretrained(temp_dir)
214
  torch.save(self.refusal_dir, os.path.join(temp_dir, "refusal_dir.pt"))
215
 
216
+ progress(0.95, desc="STEP 12/14: Uploading to HuggingFace...")
217
  # Upload to HuggingFace
218
  repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
219
  model_name = model_id.split("/")[-1]
 
237
  repo_id=repo_id
238
  )
239
 
240
+ progress(0.98, desc="STEP 13/14: Creating model card...")
241
  # Create model card
242
  try:
243
  original_card = ModelCard.load(model_id, token=oauth_token.token)
 
252
  repo_id=repo_id
253
  )
254
 
255
+ progress(1.0, desc="STEP 14/14: Complete!")
256
  return (
257
  f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
258
  f"llama{np.random.randint(9)}.png",
 
272
 
273
  for i, layer_idx in enumerate(layers_to_modify):
274
  if progress:
275
+ progress(0.85 + 0.1 * (i / total_layers), desc=f"STEP 10/14: Updating layer {layer_idx+1}/{num_layers} (Layer {i+1}/{total_layers})")
276
 
277
  layer = self.model.model.layers[layer_idx]
278
 
 
296
  try:
297
  # Build conversation history
298
  conversation = []
299
+ for msg in history:
300
+ if isinstance(msg, dict) and "role" in msg and "content" in msg:
301
+ # New format: {"role": "user", "content": "..."}
302
+ conversation.append(msg)
303
+ elif isinstance(msg, list) and len(msg) == 2:
304
+ # Old format: [user_msg, assistant_msg]
305
+ conversation.append({"role": "user", "content": msg[0]})
306
+ if msg[1]: # Only add assistant message if it exists
307
+ conversation.append({"role": "assistant", "content": msg[1]})
308
 
309
  # Add current message
310
  conversation.append({"role": "user", "content": message})
 
536
 
537
  # Chat functionality
538
  def user(user_message, history):
539
+ return "", history + [{"role": "user", "content": user_message}]
540
 
541
  def bot(history):
542
+ if history and history[-1]["role"] == "user":
543
+ response, _ = processor.chat(history[-1]["content"], history[:-1])
544
+ history.append({"role": "assistant", "content": response})
545
  return history
546
 
547
  msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
 
552
  bot, chatbot, chatbot
553
  )
554
 
555
+ clear.click(lambda: [], None, chatbot, queue=False)
556
 
557
  # Bind organization selection event
558
  export_to_org.change(