subhadip-rotalabs commited on
Commit
57f5aee
·
verified ·
1 Parent(s): 429b851

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. hierarchy_mistral_7b_instruct_v0.2/layer_10.json +17 -0
  2. hierarchy_mistral_7b_instruct_v0.2/layer_10.pt +3 -0
  3. hierarchy_mistral_7b_instruct_v0.2/layer_11.json +17 -0
  4. hierarchy_mistral_7b_instruct_v0.2/layer_11.pt +3 -0
  5. hierarchy_mistral_7b_instruct_v0.2/layer_12.json +17 -0
  6. hierarchy_mistral_7b_instruct_v0.2/layer_12.pt +3 -0
  7. hierarchy_mistral_7b_instruct_v0.2/layer_13.json +17 -0
  8. hierarchy_mistral_7b_instruct_v0.2/layer_13.pt +3 -0
  9. hierarchy_mistral_7b_instruct_v0.2/layer_14.json +17 -0
  10. hierarchy_mistral_7b_instruct_v0.2/layer_14.pt +3 -0
  11. hierarchy_mistral_7b_instruct_v0.2/layer_15.json +17 -0
  12. hierarchy_mistral_7b_instruct_v0.2/layer_15.pt +3 -0
  13. hierarchy_mistral_7b_instruct_v0.2/layer_16.json +17 -0
  14. hierarchy_mistral_7b_instruct_v0.2/layer_16.pt +3 -0
  15. hierarchy_mistral_7b_instruct_v0.2/layer_17.json +17 -0
  16. hierarchy_mistral_7b_instruct_v0.2/layer_17.pt +3 -0
  17. hierarchy_mistral_7b_instruct_v0.2/layer_18.json +17 -0
  18. hierarchy_mistral_7b_instruct_v0.2/layer_18.pt +3 -0
  19. hierarchy_mistral_7b_instruct_v0.2/layer_19.json +17 -0
  20. hierarchy_mistral_7b_instruct_v0.2/layer_19.pt +3 -0
  21. hierarchy_mistral_7b_instruct_v0.2/layer_20.json +17 -0
  22. hierarchy_mistral_7b_instruct_v0.2/layer_20.pt +3 -0
  23. hierarchy_mistral_7b_instruct_v0.2/metadata.json +89 -0
  24. hierarchy_qwen3_8b/layer_12.json +17 -0
  25. hierarchy_qwen3_8b/layer_12.pt +3 -0
  26. hierarchy_qwen3_8b/layer_13.json +17 -0
  27. hierarchy_qwen3_8b/layer_13.pt +3 -0
  28. hierarchy_qwen3_8b/layer_14.json +17 -0
  29. hierarchy_qwen3_8b/layer_14.pt +3 -0
  30. hierarchy_qwen3_8b/layer_15.json +17 -0
  31. hierarchy_qwen3_8b/layer_15.pt +3 -0
  32. hierarchy_qwen3_8b/layer_16.json +17 -0
  33. hierarchy_qwen3_8b/layer_16.pt +3 -0
  34. hierarchy_qwen3_8b/layer_17.json +17 -0
  35. hierarchy_qwen3_8b/layer_17.pt +3 -0
  36. hierarchy_qwen3_8b/layer_18.json +17 -0
  37. hierarchy_qwen3_8b/layer_18.pt +3 -0
  38. hierarchy_qwen3_8b/layer_19.json +17 -0
  39. hierarchy_qwen3_8b/layer_19.pt +3 -0
  40. hierarchy_qwen3_8b/layer_20.json +17 -0
  41. hierarchy_qwen3_8b/layer_20.pt +3 -0
  42. hierarchy_qwen3_8b/layer_21.json +17 -0
  43. hierarchy_qwen3_8b/layer_21.pt +3 -0
  44. hierarchy_qwen3_8b/layer_22.json +17 -0
  45. hierarchy_qwen3_8b/layer_22.pt +3 -0
  46. hierarchy_qwen3_8b/layer_23.json +17 -0
  47. hierarchy_qwen3_8b/layer_23.pt +3 -0
  48. hierarchy_qwen3_8b/metadata.json +96 -0
  49. refusal_gemma_2_9b_it/layer_14.json +17 -0
  50. refusal_gemma_2_9b_it/layer_14.pt +3 -0
hierarchy_mistral_7b_instruct_v0.2/layer_10.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 10,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 2.21875,
10
+ "neg_mean_norm": 1.71875,
11
+ "vector_norm": 1.171875,
12
+ "created_at": "2025-12-18T07:56:46.687323"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_10.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1006519eb5b358f74bf30a99c8c627f724ffce4df650344d5598b584f58f97a2
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_11.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 11,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 2.515625,
10
+ "neg_mean_norm": 1.9453125,
11
+ "vector_norm": 1.375,
12
+ "created_at": "2025-12-18T07:56:59.172719"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_11.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ed8a8c8648eb0b563ee51a3c12fa9c0621e04a5971f25d0a59fd748efda92ea
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_12.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 12,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 2.796875,
10
+ "neg_mean_norm": 2.15625,
11
+ "vector_norm": 1.59375,
12
+ "created_at": "2025-12-18T07:57:11.731605"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_12.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:250f67ef20cea1c35b0bff4fc053af114810620e981c6e58f4ae8ebf7d9ce7dd
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_13.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 13,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 3.15625,
10
+ "neg_mean_norm": 2.515625,
11
+ "vector_norm": 1.875,
12
+ "created_at": "2025-12-18T07:57:24.201189"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_13.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4f2c18c7d76db7d6ad80f5b17f562066b71d51e26634501d67c4f585a7d2e1e6
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_14.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 14,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 3.46875,
10
+ "neg_mean_norm": 2.75,
11
+ "vector_norm": 2.171875,
12
+ "created_at": "2025-12-18T07:57:36.698532"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_14.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b1d114124e531ea8bca68532d225a6e4f39cdfe0a634aa83fac6b9f170d9a3a
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_15.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 15,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 4.0,
10
+ "neg_mean_norm": 3.203125,
11
+ "vector_norm": 2.515625,
12
+ "created_at": "2025-12-18T07:57:49.241757"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_15.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c525b49a11073bf3cc35e581882eb059cee3eb90337ab98ec402fbd255b1e6e0
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_16.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 16,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 4.5625,
10
+ "neg_mean_norm": 3.8125,
11
+ "vector_norm": 2.796875,
12
+ "created_at": "2025-12-18T07:58:01.868522"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e44de7b7d62c10a13d54eaca978cbabf5a0e5b45b88b0341b9a76b9e0c4caa0e
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_17.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 17,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 4.96875,
10
+ "neg_mean_norm": 4.21875,
11
+ "vector_norm": 3.25,
12
+ "created_at": "2025-12-18T07:58:14.433875"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_17.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02714212448839254a5283e4fab3815a3421f790ca793b6c6622b5301a78b604
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_18.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 18,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 5.75,
10
+ "neg_mean_norm": 5.03125,
11
+ "vector_norm": 3.671875,
12
+ "created_at": "2025-12-18T07:58:26.994268"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_18.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2716a6dc3e5be95d59e5f6d3087ef86b4ea6f89432312b3297e2f1202135bd09
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_19.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 19,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 6.375,
10
+ "neg_mean_norm": 5.65625,
11
+ "vector_norm": 4.03125,
12
+ "created_at": "2025-12-18T07:58:39.596522"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_19.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a2891da9226cf5e7848205990cb14a1dbb93a154f0f72bc6a5ad9b136377c03
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/layer_20.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 20,
4
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 7.09375,
10
+ "neg_mean_norm": 6.375,
11
+ "vector_norm": 4.40625,
12
+ "created_at": "2025-12-18T07:58:52.211449"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_mistral_7b_instruct_v0.2/layer_20.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f21a277acc94df368c916e79ba65afdf98c26aa4f8e4e249bcf6985bbf701243
3
+ size 9776
hierarchy_mistral_7b_instruct_v0.2/metadata.json ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "hierarchy",
3
+ "model_name": "mistralai/Mistral-7B-Instruct-v0.2",
4
+ "layers": [
5
+ 10,
6
+ 11,
7
+ 12,
8
+ 13,
9
+ 14,
10
+ 15,
11
+ 16,
12
+ 17,
13
+ 18,
14
+ 19,
15
+ 20
16
+ ],
17
+ "best_layer": 15,
18
+ "best_improvement": 0.25,
19
+ "num_pairs": 26,
20
+ "target_direction": "increase",
21
+ "layer_results": {
22
+ "10": {
23
+ "0.0": 0.5,
24
+ "0.5": 0.75,
25
+ "1.0": 0.5,
26
+ "1.5": 0.5
27
+ },
28
+ "11": {
29
+ "0.0": 0.5,
30
+ "0.5": 0.5,
31
+ "1.0": 0.5,
32
+ "1.5": 0.5
33
+ },
34
+ "12": {
35
+ "0.0": 0.5,
36
+ "0.5": 0.75,
37
+ "1.0": 0.25,
38
+ "1.5": 0.5
39
+ },
40
+ "13": {
41
+ "0.0": 0.5,
42
+ "0.5": 0.5,
43
+ "1.0": 0.25,
44
+ "1.5": 0.5
45
+ },
46
+ "14": {
47
+ "0.0": 0.5,
48
+ "0.5": 0.5,
49
+ "1.0": 0.5,
50
+ "1.5": 0.25
51
+ },
52
+ "15": {
53
+ "0.0": 0.5,
54
+ "0.5": 0.25,
55
+ "1.0": 0.75,
56
+ "1.5": 0.5
57
+ },
58
+ "16": {
59
+ "0.0": 0.5,
60
+ "0.5": 0.5,
61
+ "1.0": 0.5,
62
+ "1.5": 0.25
63
+ },
64
+ "17": {
65
+ "0.0": 0.5,
66
+ "0.5": 0.5,
67
+ "1.0": 0.75,
68
+ "1.5": 0.0
69
+ },
70
+ "18": {
71
+ "0.0": 0.5,
72
+ "0.5": 0.75,
73
+ "1.0": 0.75,
74
+ "1.5": 0.5
75
+ },
76
+ "19": {
77
+ "0.0": 0.5,
78
+ "0.5": 0.5,
79
+ "1.0": 0.75,
80
+ "1.5": 0.75
81
+ },
82
+ "20": {
83
+ "0.0": 0.5,
84
+ "0.5": 0.75,
85
+ "1.0": 0.75,
86
+ "1.5": 0.25
87
+ }
88
+ }
89
+ }
hierarchy_qwen3_8b/layer_12.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 12,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 57.5,
10
+ "neg_mean_norm": 53.0,
11
+ "vector_norm": 16.5,
12
+ "created_at": "2025-12-18T22:23:00.548285"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_12.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce51034b11b188669eec43728629bb4910b7a172f4c1096bda675c8f4e8f29c1
3
+ size 9776
hierarchy_qwen3_8b/layer_13.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 13,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 57.75,
10
+ "neg_mean_norm": 52.5,
11
+ "vector_norm": 17.375,
12
+ "created_at": "2025-12-18T22:23:13.018805"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_13.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e78111385cbc647ba93a926857ec0728defacc450004089399612b4040e6ee
3
+ size 9776
hierarchy_qwen3_8b/layer_14.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 14,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 64.0,
10
+ "neg_mean_norm": 59.5,
11
+ "vector_norm": 18.25,
12
+ "created_at": "2025-12-18T22:23:25.489920"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_14.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77e0c227409a6cf799777c3c062be3f8de3bb502486fd4d9a8ac8ad0787b7f06
3
+ size 9776
hierarchy_qwen3_8b/layer_15.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 15,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 64.5,
10
+ "neg_mean_norm": 59.25,
11
+ "vector_norm": 20.125,
12
+ "created_at": "2025-12-18T22:23:37.918916"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_15.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c52325a7cb27e8b27546652ff6ec96617898b552f3bdc3f35e2588277bac3e22
3
+ size 9776
hierarchy_qwen3_8b/layer_16.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 16,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 70.5,
10
+ "neg_mean_norm": 66.0,
11
+ "vector_norm": 22.75,
12
+ "created_at": "2025-12-18T22:23:50.342735"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_16.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f63575f4b03adaca359a235936cad5def59645b5c633eee43a605a3fdc3bb7b4
3
+ size 9776
hierarchy_qwen3_8b/layer_17.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 17,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 79.0,
10
+ "neg_mean_norm": 72.5,
11
+ "vector_norm": 24.875,
12
+ "created_at": "2025-12-18T22:24:02.740801"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_17.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c430e797d25d7da3ae75e29dfb69c9f1086d4b598e1b3c2731fb5b3deb8c0af
3
+ size 9776
hierarchy_qwen3_8b/layer_18.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 18,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 82.0,
10
+ "neg_mean_norm": 76.0,
11
+ "vector_norm": 28.5,
12
+ "created_at": "2025-12-18T22:24:15.178867"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_18.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:413c094daf3f75329afca04e1ef3669d5eabaae2cb96f67ba3d95d8c2c771058
3
+ size 9776
hierarchy_qwen3_8b/layer_19.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 19,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 95.5,
10
+ "neg_mean_norm": 88.5,
11
+ "vector_norm": 35.5,
12
+ "created_at": "2025-12-18T22:24:27.615196"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_19.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:935c01ff2ecd1b7ca76b7216201d9d0aca1ec92d335655547226132418baa919
3
+ size 9776
hierarchy_qwen3_8b/layer_20.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 20,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 102.0,
10
+ "neg_mean_norm": 97.5,
11
+ "vector_norm": 38.0,
12
+ "created_at": "2025-12-18T22:24:40.132334"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_20.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b33e968ee60893fceb743e97b43585079759340346a0446e6d4b98f28b84efb7
3
+ size 9776
hierarchy_qwen3_8b/layer_21.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 21,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 106.5,
10
+ "neg_mean_norm": 103.0,
11
+ "vector_norm": 41.75,
12
+ "created_at": "2025-12-18T22:24:52.616517"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_21.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:355eb6c12243de94f3e020b5d7bf8c512b93f5fef321c4f3af2695c68b5133a5
3
+ size 9776
hierarchy_qwen3_8b/layer_22.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 22,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 131.0,
10
+ "neg_mean_norm": 127.0,
11
+ "vector_norm": 50.5,
12
+ "created_at": "2025-12-18T22:25:05.048245"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_22.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:580cf4c17cb42b36970276ea01be1d4fe720bf4342e179c5eff2c4d31a09497c
3
+ size 9776
hierarchy_qwen3_8b/layer_23.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "instruction_hierarchy",
3
+ "layer_index": 23,
4
+ "model_name": "Qwen/Qwen3-8B",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 26,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 159.0,
10
+ "neg_mean_norm": 156.0,
11
+ "vector_norm": 62.25,
12
+ "created_at": "2025-12-18T22:25:17.482916"
13
+ },
14
+ "vector_shape": [
15
+ 4096
16
+ ]
17
+ }
hierarchy_qwen3_8b/layer_23.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4cf2b7d9c494448f6350f792048fcd8cce2e810b9e24c4bde228970c18bf3d3c
3
+ size 9776
hierarchy_qwen3_8b/metadata.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "hierarchy",
3
+ "model_name": "Qwen/Qwen3-8B",
4
+ "layers": [
5
+ 12,
6
+ 13,
7
+ 14,
8
+ 15,
9
+ 16,
10
+ 17,
11
+ 18,
12
+ 19,
13
+ 20,
14
+ 21,
15
+ 22,
16
+ 23
17
+ ],
18
+ "best_layer": 14,
19
+ "best_improvement": 0.25,
20
+ "num_pairs": 26,
21
+ "target_direction": "increase",
22
+ "layer_results": {
23
+ "12": {
24
+ "0.0": 0.0,
25
+ "0.5": 0.25,
26
+ "1.0": 0.0,
27
+ "1.5": 0.25
28
+ },
29
+ "13": {
30
+ "0.0": 0.0,
31
+ "0.5": 0.25,
32
+ "1.0": 0.0,
33
+ "1.5": 0.0
34
+ },
35
+ "14": {
36
+ "0.0": 0.0,
37
+ "0.5": 0.25,
38
+ "1.0": 0.25,
39
+ "1.5": 0.0
40
+ },
41
+ "15": {
42
+ "0.0": 0.0,
43
+ "0.5": 0.25,
44
+ "1.0": 0.25,
45
+ "1.5": 0.5
46
+ },
47
+ "16": {
48
+ "0.0": 0.0,
49
+ "0.5": 0.0,
50
+ "1.0": 0.0,
51
+ "1.5": 0.0
52
+ },
53
+ "17": {
54
+ "0.0": 0.0,
55
+ "0.5": 0.25,
56
+ "1.0": 0.25,
57
+ "1.5": 0.0
58
+ },
59
+ "18": {
60
+ "0.0": 0.0,
61
+ "0.5": 0.25,
62
+ "1.0": 0.0,
63
+ "1.5": 0.25
64
+ },
65
+ "19": {
66
+ "0.0": 0.0,
67
+ "0.5": 0.0,
68
+ "1.0": 0.0,
69
+ "1.5": 0.0
70
+ },
71
+ "20": {
72
+ "0.0": 0.0,
73
+ "0.5": 0.25,
74
+ "1.0": 0.25,
75
+ "1.5": 0.5
76
+ },
77
+ "21": {
78
+ "0.0": 0.0,
79
+ "0.5": 0.25,
80
+ "1.0": 0.0,
81
+ "1.5": 0.0
82
+ },
83
+ "22": {
84
+ "0.0": 0.0,
85
+ "0.5": 0.0,
86
+ "1.0": 0.25,
87
+ "1.5": 0.5
88
+ },
89
+ "23": {
90
+ "0.0": 0.0,
91
+ "0.5": 0.0,
92
+ "1.0": 0.25,
93
+ "1.5": 0.25
94
+ }
95
+ }
96
+ }
refusal_gemma_2_9b_it/layer_14.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "behavior": "refusal",
3
+ "layer_index": 14,
4
+ "model_name": "google/gemma-2-9b-it",
5
+ "extraction_method": "caa",
6
+ "metadata": {
7
+ "num_pairs": 50,
8
+ "token_position": "last",
9
+ "pos_mean_norm": 161.0,
10
+ "neg_mean_norm": 147.625,
11
+ "vector_norm": 115.375,
12
+ "created_at": "2025-12-18T00:57:55.073309"
13
+ },
14
+ "vector_shape": [
15
+ 3584
16
+ ]
17
+ }
refusal_gemma_2_9b_it/layer_14.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd28df76df8f95e46b419d51abe7b2372be1b1c8676a9224d5bfeeae42f4ee67
3
+ size 8752