Spaces:

Antigma
/

Abliteration

Paused

App Files Community

Brianpuz committed on Jul 1, 2025

Commit

fbb861f

1 Parent(s): 5af26ce

Fix message format

Browse files

Files changed (1) hide show

app.py +30 -24

app.py CHANGED Viewed

@@ -104,7 +104,7 @@ class AbliterationProcessor:
     def process_abliteration(self, model_id, harmful_text, harmless_text, instructions,
                            scale_factor, skip_begin, skip_end, layer_fraction,
                            private_repo, export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None,
-                           progress=gr.Progress(track_tqdm=False)):
         """Execute abliteration processing and upload to HuggingFace"""
         if oauth_token is None or oauth_token.token is None:
             return (
@@ -127,12 +127,12 @@ class AbliterationProcessor:
             repo_owner = "self"
         try:
-            progress(desc="STEP 1/14: Loading model...")
             # Load model
             if self.model is None or self.tokenizer is None:
                 self.load_model(model_id)
-            progress(desc="STEP 2/14: Parsing instructions...")
             # Parse text content
             harmful_instructions = [line.strip() for line in harmful_text.strip().split('\n') if line.strip()]
             harmless_instructions = [line.strip() for line in harmless_text.strip().split('\n') if line.strip()]
@@ -141,12 +141,12 @@ class AbliterationProcessor:
             harmful_instructions = random.sample(harmful_instructions, min(instructions, len(harmful_instructions)))
             harmless_instructions = random.sample(harmless_instructions, min(instructions, len(harmless_instructions)))
-            progress(desc="STEP 3/14: Calculating layer index...")
             # Calculate layer index
             layer_idx = int(len(self.model.model.layers) * layer_fraction)
             pos = -1
-            progress(desc="STEP 4/14: Generating harmful tokens...")
             # Generate tokens
             harmful_toks = [
                 self.tokenizer.apply_chat_template(
@@ -156,7 +156,7 @@ class AbliterationProcessor:
                 ) for insn in harmful_instructions
             ]
-            progress(desc="STEP 5/14: Generating harmless tokens...")
             harmless_toks = [
                 self.tokenizer.apply_chat_template(
                     conversation=[{"role": "user", "content": insn}],
@@ -175,13 +175,13 @@ class AbliterationProcessor:
                     output_hidden_states=True
                 )
-            progress(desc="STEP 6/14: Processing harmful instructions...")
             harmful_outputs = [generate(toks) for toks in harmful_toks]
-            progress(desc="STEP 7/14: Processing harmless instructions...")
             harmless_outputs = [generate(toks) for toks in harmless_toks]
-            progress(desc="STEP 8/14: Extracting hidden states...")
             # Extract hidden states
             harmful_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmful_outputs]
             harmless_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmless_outputs]
@@ -189,7 +189,7 @@ class AbliterationProcessor:
             harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
             harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
-            progress(desc="STEP 9/14: Calculating refusal direction...")
             # Calculate refusal direction
             refusal_dir = harmful_mean - harmless_mean
             refusal_dir = refusal_dir / refusal_dir.norm()
@@ -201,11 +201,11 @@ class AbliterationProcessor:
             self.refusal_dir = refusal_dir
             self.projection_matrix = projection_matrix
-            progress(desc="STEP 10/14: Updating model weights...")
             # Modify model weights
             self.modify_layer_weights_optimized(projection_matrix, skip_begin, skip_end, scale_factor, progress)
-            progress(desc="STEP 11/14: Preparing model for upload...")
             # Create temporary directory to save model
             with tempfile.TemporaryDirectory() as temp_dir:
                 # Save model in safetensors format
@@ -213,7 +213,7 @@ class AbliterationProcessor:
                 self.tokenizer.save_pretrained(temp_dir)
                 torch.save(self.refusal_dir, os.path.join(temp_dir, "refusal_dir.pt"))
-                progress(desc="STEP 12/14: Uploading to HuggingFace...")
                 # Upload to HuggingFace
                 repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
                 model_name = model_id.split("/")[-1]
@@ -237,7 +237,7 @@ class AbliterationProcessor:
                             repo_id=repo_id
                         )
-                progress(desc="STEP 13/14: Creating model card...")
                 # Create model card
                 try:
                     original_card = ModelCard.load(model_id, token=oauth_token.token)
@@ -252,7 +252,7 @@ class AbliterationProcessor:
                     repo_id=repo_id
                 )
-                progress(desc="STEP 14/14: Complete!")
                 return (
                     f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
                     f"llama{np.random.randint(9)}.png",
@@ -272,7 +272,7 @@ class AbliterationProcessor:
         for i, layer_idx in enumerate(layers_to_modify):
             if progress:
-                progress(desc=f"STEP 10/14: Updating layer {layer_idx+1}/{num_layers} (Layer {i+1}/{total_layers})")
             layer = self.model.model.layers[layer_idx]
@@ -296,9 +296,15 @@ class AbliterationProcessor:
         try:
             # Build conversation history
             conversation = []
-            for human, assistant in history:
-                conversation.append({"role": "user", "content": human})
-                conversation.append({"role": "assistant", "content": assistant})
             # Add current message
             conversation.append({"role": "user", "content": message})
@@ -530,12 +536,12 @@ def create_interface():
         # Chat functionality
         def user(user_message, history):
-            return "", history + [[user_message, None]]
         def bot(history):
-            if history and history[-1][1] is None:
-                response, _ = processor.chat(history[-1][0], history[:-1])
-                history[-1][1] = response
             return history
         msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
@@ -546,7 +552,7 @@ def create_interface():
             bot, chatbot, chatbot
         )
-        clear.click(lambda: None, None, chatbot, queue=False)
         # Bind organization selection event
         export_to_org.change(

     def process_abliteration(self, model_id, harmful_text, harmless_text, instructions,
                            scale_factor, skip_begin, skip_end, layer_fraction,
                            private_repo, export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None,
+                           progress=gr.Progress()):
         """Execute abliteration processing and upload to HuggingFace"""
         if oauth_token is None or oauth_token.token is None:
             return (
             repo_owner = "self"
         try:
+            progress(0, desc="STEP 1/14: Loading model...")
             # Load model
             if self.model is None or self.tokenizer is None:
                 self.load_model(model_id)
+            progress(0.1, desc="STEP 2/14: Parsing instructions...")
             # Parse text content
             harmful_instructions = [line.strip() for line in harmful_text.strip().split('\n') if line.strip()]
             harmless_instructions = [line.strip() for line in harmless_text.strip().split('\n') if line.strip()]
             harmful_instructions = random.sample(harmful_instructions, min(instructions, len(harmful_instructions)))
             harmless_instructions = random.sample(harmless_instructions, min(instructions, len(harmless_instructions)))
+            progress(0.2, desc="STEP 3/14: Calculating layer index...")
             # Calculate layer index
             layer_idx = int(len(self.model.model.layers) * layer_fraction)
             pos = -1
+            progress(0.3, desc="STEP 4/14: Generating harmful tokens...")
             # Generate tokens
             harmful_toks = [
                 self.tokenizer.apply_chat_template(
                 ) for insn in harmful_instructions
             ]
+            progress(0.4, desc="STEP 5/14: Generating harmless tokens...")
             harmless_toks = [
                 self.tokenizer.apply_chat_template(
                     conversation=[{"role": "user", "content": insn}],
                     output_hidden_states=True
                 )
+            progress(0.5, desc="STEP 6/14: Processing harmful instructions...")
             harmful_outputs = [generate(toks) for toks in harmful_toks]
+            progress(0.6, desc="STEP 7/14: Processing harmless instructions...")
             harmless_outputs = [generate(toks) for toks in harmless_toks]
+            progress(0.7, desc="STEP 8/14: Extracting hidden states...")
             # Extract hidden states
             harmful_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmful_outputs]
             harmless_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmless_outputs]
             harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
             harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
+            progress(0.8, desc="STEP 9/14: Calculating refusal direction...")
             # Calculate refusal direction
             refusal_dir = harmful_mean - harmless_mean
             refusal_dir = refusal_dir / refusal_dir.norm()
             self.refusal_dir = refusal_dir
             self.projection_matrix = projection_matrix
+            progress(0.85, desc="STEP 10/14: Updating model weights...")
             # Modify model weights
             self.modify_layer_weights_optimized(projection_matrix, skip_begin, skip_end, scale_factor, progress)
+            progress(0.9, desc="STEP 11/14: Preparing model for upload...")
             # Create temporary directory to save model
             with tempfile.TemporaryDirectory() as temp_dir:
                 # Save model in safetensors format
                 self.tokenizer.save_pretrained(temp_dir)
                 torch.save(self.refusal_dir, os.path.join(temp_dir, "refusal_dir.pt"))
+                progress(0.95, desc="STEP 12/14: Uploading to HuggingFace...")
                 # Upload to HuggingFace
                 repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
                 model_name = model_id.split("/")[-1]
                             repo_id=repo_id
                         )
+                progress(0.98, desc="STEP 13/14: Creating model card...")
                 # Create model card
                 try:
                     original_card = ModelCard.load(model_id, token=oauth_token.token)
                     repo_id=repo_id
                 )
+                progress(1.0, desc="STEP 14/14: Complete!")
                 return (
                     f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
                     f"llama{np.random.randint(9)}.png",
         for i, layer_idx in enumerate(layers_to_modify):
             if progress:
+                progress(0.85 + 0.1 * (i / total_layers), desc=f"STEP 10/14: Updating layer {layer_idx+1}/{num_layers} (Layer {i+1}/{total_layers})")
             layer = self.model.model.layers[layer_idx]
         try:
             # Build conversation history
             conversation = []
+            for msg in history:
+                if isinstance(msg, dict) and "role" in msg and "content" in msg:
+                    # New format: {"role": "user", "content": "..."}
+                    conversation.append(msg)
+                elif isinstance(msg, list) and len(msg) == 2:
+                    # Old format: [user_msg, assistant_msg]
+                    conversation.append({"role": "user", "content": msg[0]})
+                    if msg[1]:  # Only add assistant message if it exists
+                        conversation.append({"role": "assistant", "content": msg[1]})
             # Add current message
             conversation.append({"role": "user", "content": message})
         # Chat functionality
         def user(user_message, history):
+            return "", history + [{"role": "user", "content": user_message}]
         def bot(history):
+            if history and history[-1]["role"] == "user":
+                response, _ = processor.chat(history[-1]["content"], history[:-1])
+                history.append({"role": "assistant", "content": response})
             return history
         msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
             bot, chatbot, chatbot
         )
+        clear.click(lambda: [], None, chatbot, queue=False)
         # Bind organization selection event
         export_to_org.change(