arubique commited on
Commit
df264fd
·
verified ·
1 Parent(s): 82fd9e2

Upload folder using huggingface_hub

Browse files
config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "disco",
3
+ "auto_map": {
4
+ "AutoConfig": "configuration_disco.DiscoConfig",
5
+ "AutoModel": "modeling_disco.DiscoPredictor"
6
+ },
7
+ "n_components": 256,
8
+ "sampling_name": "high-disagreement@100+nonstratified",
9
+ "number_item": "100",
10
+ "fitted_model_type": "RandomForestRegressor_100"
11
+ }
configuration_disco.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright 2025 MASEval contributors. DISCO predictor config for Hugging Face Hub.

try:
    from transformers import PreTrainedConfig
except ImportError:
    # Older transformers releases export this class only as ``PretrainedConfig``
    # (lowercase "t"); alias it so this remote-code module loads on both.
    from transformers import PretrainedConfig as PreTrainedConfig


class DiscoConfig(PreTrainedConfig):
    """Configuration for DISCO predictor (PCA + Random Forest) on the Hub.

    Mirrors the keys stored in ``config.json``:
        n_components: Number of PCA components in the embedding.
        sampling_name: Identifier of the anchor-point sampling strategy.
        number_item: Number of anchor items (stored as a string).
        fitted_model_type: Identifier of the fitted regressor.
    """

    model_type = "disco"

    def __init__(
        self,
        n_components: int = 256,
        sampling_name: str = "",
        number_item: str = "",
        fitted_model_type: str = "",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.n_components = n_components
        self.sampling_name = sampling_name
        self.number_item = number_item
        self.fitted_model_type = fitted_model_type
disco_meta.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "sampling_name": "high-disagreement@100+nonstratified",
3
+ "number_item": "100",
4
+ "fitted_model_type": "RandomForestRegressor_100"
5
+ }
disco_model.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aa3ff6702bf4044dd2d40bd8a137210f117012fa17032831ef41ac851f2e548
3
+ size 1338140
disco_transform.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4905d78a7279cb94cee302a4a751a2109f44dd8129b6cc6bf6dbc846a226f2e
3
+ size 6374652
modeling_disco.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 MASEval contributors. DISCO predictor model for Hugging Face Hub.
2
+ #
3
+ # Self-contained: uses only numpy and huggingface_hub. Load with:
4
+ # from transformers import AutoModel
5
+ # model = AutoModel.from_pretrained("<USERNAME>/my-disco-mmlu", trust_remote_code=True)
6
+ # acc = model.predict(predictions_tensor) # predictions: (n_models, n_anchor_points, n_classes)
7
+
8
+ from pathlib import Path
9
+ from typing import Optional, Union
10
+
11
+ import numpy as np
12
+
13
+
14
+ def _pca_transform(X: np.ndarray, components: np.ndarray, mean: np.ndarray) -> np.ndarray:
15
+ """Apply PCA transform: (X - mean) @ components.T."""
16
+ return (X - mean) @ components.T
17
+
18
+
19
+ def _predict_tree(
20
+ X: np.ndarray,
21
+ children_left: np.ndarray,
22
+ children_right: np.ndarray,
23
+ feature: np.ndarray,
24
+ threshold: np.ndarray,
25
+ value: np.ndarray,
26
+ ) -> np.ndarray:
27
+ """Predict for one tree; X (n_samples, n_features) -> (n_samples,)."""
28
+ out = np.empty(X.shape[0], dtype=np.float64)
29
+ for i in range(X.shape[0]):
30
+ node = 0
31
+ while children_left[node] != -1:
32
+ if X[i, feature[node]] <= threshold[node]:
33
+ node = children_left[node]
34
+ else:
35
+ node = children_right[node]
36
+ out[i] = value[node]
37
+ return out
38
+
39
+
40
def _predict_rf(
    X: np.ndarray,
    tree_node_counts: np.ndarray,
    children_left: np.ndarray,
    children_right: np.ndarray,
    feature: np.ndarray,
    threshold: np.ndarray,
    value: np.ndarray,
) -> np.ndarray:
    """Average per-tree predictions of a random forest over X -> (n_samples,).

    Tree t's node arrays occupy the slice [boundaries[t], boundaries[t + 1]) of
    each flattened per-node array, where the boundaries come from the cumulative
    node counts.
    """
    boundaries = np.concatenate([[0], np.cumsum(tree_node_counts)])
    per_tree = [
        _predict_tree(
            X,
            children_left[start:stop],
            children_right[start:stop],
            feature[start:stop],
            threshold[start:stop],
            value[start:stop],
        )
        for start, stop in zip(boundaries[:-1], boundaries[1:])
    ]
    return np.mean(np.stack(per_tree), axis=0)
64
+
65
+
66
class DiscoPredictor:
    """
    DISCO predictor: maps anchor-point prediction tensors to full-benchmark accuracy.

    Load from the Hub with:
        from transformers import AutoModel
        model = AutoModel.from_pretrained("<USERNAME>/my-disco-mmlu", trust_remote_code=True)

    Then call model.predict(predictions) where predictions has shape
    (n_models, n_anchor_points, n_classes) (e.g. log-probabilities per choice).
    Returns a 1D array of predicted full-benchmark accuracies, one per model.
    """

    def __init__(
        self,
        components: np.ndarray,
        mean: np.ndarray,
        tree_node_counts: np.ndarray,
        children_left: np.ndarray,
        children_right: np.ndarray,
        feature: np.ndarray,
        threshold: np.ndarray,
        value: np.ndarray,
        config: Optional["DiscoConfig"] = None,
    ):
        """Store PCA parameters and flattened random-forest tree arrays.

        Args:
            components: PCA components, shape (n_components, n_features).
            mean: PCA feature means, shape (n_features,).
            tree_node_counts: Number of nodes per tree; the remaining arrays are
                the per-tree node arrays concatenated in tree order.
            children_left: Left-child index per node (-1 at leaves).
            children_right: Right-child index per node (-1 at leaves).
            feature: Split feature index per node.
            threshold: Split threshold per node.
            value: Leaf value per node.
            config: Optional DiscoConfig carrying fit metadata.
        """
        # Cast once here so predict() also accepts lists / float32 inputs.
        self._components = np.asarray(components, dtype=np.float64)
        self._mean = np.asarray(mean, dtype=np.float64)
        self._tree_node_counts = np.asarray(tree_node_counts, dtype=np.int64)
        self._children_left = np.asarray(children_left, dtype=np.int32)
        self._children_right = np.asarray(children_right, dtype=np.int32)
        self._feature = np.asarray(feature, dtype=np.int32)
        self._threshold = np.asarray(threshold, dtype=np.float64)
        self._value = np.asarray(value, dtype=np.float64)
        self.config = config

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, Path],
        **kwargs,
    ) -> "DiscoPredictor":
        """Load DISCO weights from a Hugging Face repo or local directory."""
        try:
            from huggingface_hub import snapshot_download
        except ImportError as e:
            raise ImportError("Loading from Hub requires huggingface_hub: pip install huggingface_hub") from e

        path = Path(pretrained_model_name_or_path)
        if not path.exists() or not path.is_dir():
            # Not a local directory: treat it as a Hub repo id and download.
            path = Path(snapshot_download(pretrained_model_name_or_path))

        # Load config if present. Best-effort: the weights are fully usable
        # without transformers, so config loading failures are swallowed.
        config = None
        config_path = path / "config.json"
        if config_path.exists():
            try:
                from transformers import AutoConfig

                config = AutoConfig.from_pretrained(str(path), trust_remote_code=True)
            except Exception:
                pass

        # Load PCA (transform)
        transform_data = np.load(path / "disco_transform.npz")
        components = np.asarray(transform_data["components_"])
        mean = np.asarray(transform_data["mean_"])

        # Load RF (model)
        model_data = np.load(path / "disco_model.npz")
        tree_node_counts = np.asarray(model_data["tree_node_counts"], dtype=np.int64)
        children_left = np.asarray(model_data["children_left"], dtype=np.int32)
        children_right = np.asarray(model_data["children_right"], dtype=np.int32)
        feature = np.asarray(model_data["feature"], dtype=np.int32)
        threshold = np.asarray(model_data["threshold"], dtype=np.float64)
        value = np.asarray(model_data["value"], dtype=np.float64)

        return cls(
            components=components,
            mean=mean,
            tree_node_counts=tree_node_counts,
            children_left=children_left,
            children_right=children_right,
            feature=feature,
            threshold=threshold,
            value=value,
            config=config,
        )

    def predict(
        self,
        predictions: np.ndarray,
        apply_softmax: bool = True,
    ) -> np.ndarray:
        """
        Predict full-benchmark accuracy from anchor-point predictions.

        Args:
            predictions: Shape (n_models, n_anchor_points, n_classes), e.g.
                log-probabilities. A single model's 2D array
                (n_anchor_points, n_classes) is also accepted.
            apply_softmax: If True, apply softmax to predictions before PCA (default True).

        Returns:
            Shape (n_models,) predicted full-benchmark accuracies.
        """
        X = np.asarray(predictions, dtype=np.float64)
        if X.ndim == 2:
            # Promote a single model to a batch of one.
            X = X[np.newaxis, ...]
        n_models = X.shape[0]
        if apply_softmax:
            # Numerically stable softmax over the class (last) dimension.
            X = np.exp(X - X.max(axis=-1, keepdims=True))
            X = X / X.sum(axis=-1, keepdims=True)
        # Flatten to (n_models, n_anchor_points * n_classes) feature vectors.
        X = X.reshape(n_models, -1)
        emb = _pca_transform(X, self._components, self._mean)
        return _predict_rf(
            emb,
            self._tree_node_counts,
            self._children_left,
            self._children_right,
            self._feature,
            self._threshold,
            self._value,
        )

    def save_pretrained(self, save_directory: Union[str, Path]) -> None:
        """Save DISCO weights and config to a directory (e.g. for uploading to Hub)."""
        # NOTE: the original unconditionally imported transformers.AutoConfig
        # here without using it, which made saving fail when transformers was
        # not installed even though only numpy is required. Removed.
        path = Path(save_directory)
        path.mkdir(parents=True, exist_ok=True)
        np.savez(
            path / "disco_transform.npz",
            components_=self._components,
            mean_=self._mean,
        )
        np.savez(
            path / "disco_model.npz",
            tree_node_counts=self._tree_node_counts,
            children_left=self._children_left,
            children_right=self._children_right,
            feature=self._feature,
            threshold=self._threshold,
            value=self._value,
        )
        if self.config is not None:
            self.config.save_pretrained(save_directory)