fix problems
app.py CHANGED
@@ -81,23 +81,30 @@ class AbliterationProcessor:
     def load_model(self, model_id):
         """Load model and tokenizer"""
         try:
+            # Auto-detect GPU
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"Using device: {device}")
+
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_id,
                 trust_remote_code=True,
-                torch_dtype=torch.float16
+                torch_dtype=torch.float16,
+                device_map="auto" if device == "cuda" else None
             )
             self.tokenizer = AutoTokenizer.from_pretrained(
                 model_id,
                 trust_remote_code=True
             )
-            return f"✅ Model {model_id} loaded successfully!"
+
+            device_info = f" on {device.upper()}" if device == "cuda" else ""
+            return f"✅ Model {model_id} loaded successfully{device_info}!"
         except Exception as e:
             return f"❌ Model loading failed: {str(e)}"
 
     def process_abliteration(self, model_id, harmful_text, harmless_text, instructions,
                              scale_factor, skip_begin, skip_end, layer_fraction,
                              private_repo, export_to_org, repo_owner, org_token, oauth_token: gr.OAuthToken | None,
-                             progress=gr.Progress()):
+                             progress=gr.Progress(track_tqdm=False)):
         """Execute abliteration processing and upload to HuggingFace"""
         if oauth_token is None or oauth_token.token is None:
             return (
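For reference, the new loading path as one self-contained sketch (the model id is a placeholder; `device_map="auto"` additionally requires the `accelerate` package, and float16 on CPU is slow or unsupported for some ops, so CPU users may prefer to drop the dtype override):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder model id
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto" if device == "cuda" else None,  # needs `accelerate` when set
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
```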
@@ -120,12 +127,12 @@ class AbliterationProcessor:
             repo_owner = "self"
 
         try:
-            progress(
+            progress(desc="STEP 1/14: Loading model...")
             # Load model
             if self.model is None or self.tokenizer is None:
                 self.load_model(model_id)
 
-            progress(
+            progress(desc="STEP 2/14: Parsing instructions...")
             # Parse text content
             harmful_instructions = [line.strip() for line in harmful_text.strip().split('\n') if line.strip()]
             harmless_instructions = [line.strip() for line in harmless_text.strip().split('\n') if line.strip()]
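The `progress(desc=...)` calls depend on Gradio injecting a live tracker into the `progress` parameter while the event runs; a bare `desc` shows an indeterminate bar, and a leading float sets a fraction. A minimal sketch of the mechanism, assuming a recent Gradio:

```python
import time
import gradio as gr

def long_task(steps, progress=gr.Progress(track_tqdm=False)):
    steps = int(steps)
    for i in range(steps):
        # A leading float positions the bar; desc-only calls are indeterminate.
        progress(i / steps, desc=f"STEP {i + 1}/{steps}: working...")
        time.sleep(0.2)
    return "done"

demo = gr.Interface(long_task, gr.Number(value=5), gr.Textbox())
demo.launch()
```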
@@ -134,12 +141,12 @@ class AbliterationProcessor:
             harmful_instructions = random.sample(harmful_instructions, min(instructions, len(harmful_instructions)))
             harmless_instructions = random.sample(harmless_instructions, min(instructions, len(harmless_instructions)))
 
-            progress(
+            progress(desc="STEP 3/14: Calculating layer index...")
             # Calculate layer index
             layer_idx = int(len(self.model.model.layers) * layer_fraction)
             pos = -1
 
-            progress(
+            progress(desc="STEP 4/14: Generating harmful tokens...")
             # Generate tokens
             harmful_toks = [
                 self.tokenizer.apply_chat_template(
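To make the layer selection concrete, a tiny worked example with hypothetical numbers:

```python
num_layers = 32                               # hypothetical model depth
layer_fraction = 0.6
layer_idx = int(num_layers * layer_fraction)  # 19
pos = -1                                      # read the last prompt token's activation
```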
@@ -149,7 +156,7 @@ class AbliterationProcessor:
                 ) for insn in harmful_instructions
             ]
 
-            progress(
+            progress(desc="STEP 5/14: Generating harmless tokens...")
             harmless_toks = [
                 self.tokenizer.apply_chat_template(
                     conversation=[{"role": "user", "content": insn}],
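`apply_chat_template` wraps each instruction in the model's chat format. The call pattern in isolation (model id is a placeholder, and `add_generation_prompt=True` is an assumption from the usual recipe, since that line falls outside the hunk):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder
toks = tokenizer.apply_chat_template(
    conversation=[{"role": "user", "content": "How do I sort a list in Python?"}],
    add_generation_prompt=True,  # assumed: appends the assistant turn marker
    return_tensors="pt",
)
print(toks.shape)  # (1, prompt_length)
```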
@@ -168,13 +175,13 @@ class AbliterationProcessor:
                 output_hidden_states=True
             )
 
-            progress(
+            progress(desc="STEP 6/14: Processing harmful instructions...")
             harmful_outputs = [generate(toks) for toks in harmful_toks]
 
-            progress(
+            progress(desc="STEP 7/14: Processing harmless instructions...")
             harmless_outputs = [generate(toks) for toks in harmless_toks]
 
-            progress(
+            progress(desc="STEP 8/14: Extracting hidden states...")
             # Extract hidden states
             harmful_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmful_outputs]
             harmless_hidden = [output.hidden_states[0][layer_idx][:, pos, :] for output in harmless_outputs]
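The indexing `output.hidden_states[0][layer_idx][:, pos, :]` works because, when `generate` is called with `output_hidden_states=True` (and, presumably, `return_dict_in_generate=True` inside the `generate` helper above), `hidden_states` is a tuple over decoding steps: element `[0]` is the prefill pass, itself a tuple with one `(batch, seq_len, hidden)` tensor per layer, embeddings included. A sketch, with `model`, `toks`, and `layer_idx` as in the surrounding code:

```python
out = model.generate(
    toks,
    max_new_tokens=1,
    return_dict_in_generate=True,  # assumed; needed to get .hidden_states
    output_hidden_states=True,
)
prefill = out.hidden_states[0]    # one tensor per layer (embeddings at index 0)
h = prefill[layer_idx][:, -1, :]  # (batch, hidden): last prompt token's state
```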
@@ -182,7 +189,7 @@ class AbliterationProcessor:
             harmful_mean = torch.stack(harmful_hidden).mean(dim=0)
             harmless_mean = torch.stack(harmless_hidden).mean(dim=0)
 
-            progress(
+            progress(desc="STEP 9/14: Calculating refusal direction...")
             # Calculate refusal direction
             refusal_dir = harmful_mean - harmless_mean
             refusal_dir = refusal_dir / refusal_dir.norm()
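The refusal direction is the normalized difference of mean activations. The `projection_matrix` stored a few lines below is computed outside this hunk; the standard abliteration choice is the rank-one projector onto the refusal direction, sketched here under that assumption:

```python
import torch

# harmful_mean / harmless_mean: (1, hidden) mean activations at the chosen layer
refusal_dir = harmful_mean - harmless_mean
refusal_dir = refusal_dir / refusal_dir.norm()

# Assumed convention: P projects onto the refusal direction, so
# subtracting P @ W from a weight matrix removes that component.
r = refusal_dir.view(-1)
projection_matrix = torch.outer(r, r)  # (hidden, hidden)
```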
@@ -194,11 +201,11 @@ class AbliterationProcessor:
             self.refusal_dir = refusal_dir
             self.projection_matrix = projection_matrix
 
-            progress(
+            progress(desc="STEP 10/14: Updating model weights...")
             # Modify model weights
             self.modify_layer_weights_optimized(projection_matrix, skip_begin, skip_end, scale_factor, progress)
 
-            progress(
+            progress(desc="STEP 11/14: Preparing model for upload...")
             # Create temporary directory to save model
             with tempfile.TemporaryDirectory() as temp_dir:
                 # Save model in safetensors format
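`modify_layer_weights_optimized` itself is outside this diff. A common shape for such a routine, orthogonalizing each block's output projections against the refusal direction with adjustable strength, looks roughly like this (attribute names follow Llama-style models; this is a hypothetical sketch, not the Space's actual implementation):

```python
def ablate_layer_weights(model, projection_matrix, skip_begin, skip_end, scale_factor):
    """Sketch: damp the refusal-direction component of each block's
    residual-stream writes; scale_factor=1.0 removes it entirely."""
    layers = model.model.layers[skip_begin : len(model.model.layers) - skip_end]
    for layer in layers:
        for lin in (layer.self_attn.o_proj, layer.mlp.down_proj):
            W = lin.weight.data
            P = projection_matrix.to(W.dtype).to(W.device)
            lin.weight.data = W - scale_factor * (P @ W)
```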
@@ -206,7 +213,7 @@ class AbliterationProcessor:
                 self.tokenizer.save_pretrained(temp_dir)
                 torch.save(self.refusal_dir, os.path.join(temp_dir, "refusal_dir.pt"))
 
-                progress(
+                progress(desc="STEP 12/14: Uploading to HuggingFace...")
                 # Upload to HuggingFace
                 repo_namespace = get_repo_namespace(repo_owner, username, user_orgs)
                 model_name = model_id.split("/")[-1]
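The repo creation and upload calls sit mostly outside this hunk; the usual `huggingface_hub` pattern they follow looks like this (repo id and token are placeholders):

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_...")  # placeholder token
repo_id = "my-user/model-abliterated"  # placeholder
api.create_repo(repo_id=repo_id, repo_type="model", private=True, exist_ok=True)
api.upload_folder(folder_path=temp_dir, repo_id=repo_id, repo_type="model")
```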
@@ -230,7 +237,7 @@ class AbliterationProcessor:
                     repo_id=repo_id
                 )
 
-                progress(
+                progress(desc="STEP 13/14: Creating model card...")
                 # Create model card
                 try:
                     original_card = ModelCard.load(model_id, token=oauth_token.token)
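`ModelCard.load` fetches the source model's card so its metadata can be carried over to the new repo; in isolation:

```python
from huggingface_hub import ModelCard

card = ModelCard.load("gpt2")  # any existing repo id; placeholder
card.data.tags = (card.data.tags or []) + ["abliterated"]  # hypothetical edit
card.push_to_hub("my-user/model-abliterated", token="hf_...")  # placeholders
```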
@@ -245,7 +252,7 @@ class AbliterationProcessor:
                     repo_id=repo_id
                 )
 
-                progress(
+                progress(desc="STEP 14/14: Complete!")
                 return (
                     f'<h1>✅ DONE</h1><br/>Repo: <a href="{new_repo_url}" target="_blank" style="text-decoration:underline">{repo_id}</a>',
                     f"llama{np.random.randint(9)}.png",
@@ -265,7 +272,7 @@ class AbliterationProcessor:
 
         for i, layer_idx in enumerate(layers_to_modify):
             if progress:
-                progress(
+                progress(desc=f"STEP 10/14: Updating layer {layer_idx+1}/{num_layers} (Layer {i+1}/{total_layers})")
 
             layer = self.model.model.layers[layer_idx]
 
@@ -303,22 +310,34 @@ class AbliterationProcessor:
                 return_tensors="pt"
             )
 
-            # Generate response
+            # Generate response with streaming like abliterated_optimized.py
+            from transformers import TextStreamer
+
+            # Create a custom streamer that captures all output
+            captured_output = []
+
+            class CustomStreamer(TextStreamer):
+                def __init__(self, tokenizer, skip_prompt=True, skip_special_tokens=True):
+                    super().__init__(tokenizer, skip_prompt=skip_prompt, skip_special_tokens=skip_special_tokens)
+                    self.captured = []
+
+                def on_finalized_text(self, text: str, stream_end: bool = False):
+                    self.captured.append(text)
+                    super().on_finalized_text(text, stream_end)
+
+            streamer = CustomStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True)
+
             gen = self.model.generate(
                 toks.to(self.model.device),
                 max_new_tokens=2048,
                 temperature=0.7,
                 do_sample=True,
-                pad_token_id=self.tokenizer.eos_token_id
-            )
-
-            # Decode response
-            decoded = self.tokenizer.batch_decode(
-                gen[0][len(toks[0]):],
-                skip_special_tokens=True
+                pad_token_id=self.tokenizer.eos_token_id,
+                streamer=streamer
             )
 
-            response
+            # Get the complete response from streamer
+            response = "".join(streamer.captured).strip()
             return response, history + [[message, response]]
 
         except Exception as e:
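`TextStreamer` prints tokens to stdout as they are generated; the subclass additionally records each finalized chunk so the full reply can be reassembled afterwards. Standalone usage, with `model`, `tokenizer`, and `toks` as loaded earlier:

```python
streamer = CustomStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
_ = model.generate(
    toks.to(model.device),
    max_new_tokens=64,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
    streamer=streamer,  # chunks go to stdout and into streamer.captured
)
reply = "".join(streamer.captured).strip()
```

Note that the module-level `captured_output = []` added by the diff is never read; the subclass's own `self.captured` does the collecting.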
@@ -473,7 +492,8 @@ def create_interface():
             with gr.TabItem("💬 Chat Test"):
                 chatbot = gr.Chatbot(
                     label="Chat Window",
-                    height=400
+                    height=400,
+                    type="messages"
                 )
                 msg = gr.Textbox(
                     label="Input Message",