|
|
|
|
|
""" |
|
|
Track per-neuron activations in Qwen2 MLP layers using Hugging Face Transformers |
|
|
with explicit device management. |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import os |
|
|
from types import MethodType |
|
|
|
|
|
import torch |
|
|
from torch import Tensor |
|
|
from tqdm import tqdm |
|
|
from transformers import AutoModelForCausalLM |
|
|
|
|
|
|
|
|
class ActivationTracker:
    """Accumulate per-neuron positive-activation counts for every MLP layer.

    The counter tensor lives on the CPU so the per-layer hooks can stream
    partial sums off the accelerator instead of keeping them resident there.
    """

    def __init__(self, num_layers: int, intermediate_size: int):
        """Create a zeroed ``(num_layers, intermediate_size)`` counter.

        NOTE(review): int32 overflows after ~2.1e9 positive activations per
        neuron; switch to int64 if very large corpora are processed.
        """
        self.over_zero = torch.zeros(
            num_layers, intermediate_size, dtype=torch.int32, device="cpu"
        )

    def make_qwen_hook(self, index: int):
        """Return a replacement ``forward`` for one Qwen2 MLP block.

        The returned function is meant to be bound onto ``layer.mlp`` via
        ``types.MethodType``; ``index`` selects the row of ``over_zero``
        that this layer updates.
        """
        over_zero = self.over_zero

        def qwen_forward(self, x: Tensor):
            # Standard SwiGLU MLP: act(gate(x)) * up(x) -> down-projection.
            gate_activation = self.act_fn(self.gate_proj(x))
            with torch.no_grad():
                # Count strictly-positive activations per neuron. Flatten
                # every leading dimension so inputs of any rank are handled
                # (the previous dim=(0, 1) assumed exactly (batch, seq, dim)
                # and would broadcast-corrupt the row for 2-D inputs).
                flat = gate_activation.reshape(-1, gate_activation.size(-1))
                over_zero[index, :] += (flat > 0).sum(dim=0).to("cpu")
            return self.down_proj(gate_activation * self.up_proj(x))

        return qwen_forward
|
|
|
|
|
|
|
|
# ---- Command-line interface -------------------------------------------------
parser = argparse.ArgumentParser()
for flag, options in (
    ("--model", {"type": str, "required": True, "help": "HF model ID or local folder path"}),
    ("--lang", {"type": str, "required": True, "help": "Language code for dataset"}),
    ("--data-path", {"type": str, "required": True, "help": "Path to tokenized dataset (torch tensor)"}),
    ("--output-dir", {"type": str, "default": "activations", "help": "Directory to save over_zero"}),
    ("--batch-size", {"type": int, "default": 1, "help": "Batch size per device"}),
    ("--chunk-size", {"type": int, "default": 4096, "help": "Max sequence length to process at once"}),
):
    parser.add_argument(flag, **options)
args = parser.parse_args()
|
|
|
|
|
|
|
|
# Select the accelerator if one is available. Model placement below is done
# with device_map="auto", so this handle is used for logging and for moving
# input batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
os.makedirs(args.output_dir, exist_ok=True)

# Load the pre-tokenized corpus onto the CPU.
# NOTE(review): assumes the file holds a single flat torch tensor of token
# ids — confirm against the tokenization script that produced it.
print("Loading data...")
ids = torch.load(args.data_path, map_location="cpu")
|
|
|
|
|
|
|
|
# Load the causal LM in bfloat16, letting Accelerate shard it across the
# available devices (device_map="auto").
print(f"Loading model: {args.model}")
model = AutoModelForCausalLM.from_pretrained(
    args.model,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
model.eval()

# Model geometry: sizes the counter tensor and clamps the chunk length to
# what the model's position embeddings support.
num_layers = model.config.num_hidden_layers
intermediate_size = model.config.intermediate_size
max_len = model.config.max_position_embeddings
|
|
|
|
|
|
|
|
# One (num_layers, intermediate_size) CPU counter shared by all layer hooks.
tracker = ActivationTracker(num_layers=num_layers, intermediate_size=intermediate_size)

# Replace each MLP's forward with a counting wrapper. MethodType binds the
# hook so that `self` inside it refers to that layer's mlp module.
for i, layer in enumerate(model.model.layers):
    layer.mlp.forward = MethodType(tracker.make_qwen_hook(i), layer.mlp)
|
|
|
|
|
|
|
|
# Split the flat token stream into fixed-length rows. Tokens past the last
# full chunk are dropped; if the corpus is shorter than one chunk, n == 0
# and nothing is processed (the loop below simply does zero iterations).
chunk_size = min(args.chunk_size, max_len)
n = (ids.size(0) // chunk_size) * chunk_size
input_ids = ids[:n].reshape(-1, chunk_size)

print(f"Processing {input_ids.size(0)} sequences of length {chunk_size}")
|
|
|
|
|
|
|
|
# Run the corpus through the model; the patched MLP forwards update
# tracker.over_zero as a side effect. Logits are discarded.
with torch.no_grad():
    for i in tqdm(range(0, input_ids.size(0), args.batch_size), desc="Processing", unit="batch"):
        batch = input_ids[i:i + args.batch_size]

        # Move the batch to the device holding the first model shard; with
        # device_map="auto", Accelerate routes activations between shards.
        batch = batch.to(next(model.parameters()).device)

        # NOTE(review): emptying the CUDA cache on every batch is costly
        # and usually unnecessary — consider removing it or running it
        # only periodically if memory pressure is the concern.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Forward pass purely for its hook side effects.
        model(input_ids=batch)
|
|
|
|
|
|
|
|
# Derive a filesystem-friendly model name (basename handles both HF ids
# like "org/model" and local directory paths) and save the raw counter.
model_name = os.path.basename(args.model.rstrip("/"))
out_path = os.path.join(args.output_dir, f"activation_{model_name}_{args.lang}.pt")
torch.save(tracker.over_zero, out_path)
print(f"Saved activation counts to {out_path}")
print("Activation single job done")