IvoHoese commited on
Commit
0533c5e
·
verified ·
1 Parent(s): a8e550b

Upload 2 files

Browse files
Files changed (2) hide show
  1. task_template.py +314 -0
  2. utils.py +232 -0
task_template.py ADDED
@@ -0,0 +1,314 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import pandas as pd
import requests
import sys
import torchvision.models as models  # NOTE(review): appears unused in this template — confirm before removing
import os

from transformers import AutoTokenizer, PreTrainedModel, AutoModelForSequenceClassification

import utils

# --------------------------------
# DATASET
# --------------------------------

"""
Dataset contents:

- 1000 subsets of text data, each subset stored under the key "subset_{i}" where i ranges from 0 to 999.
Each subset is a dictionary with:
-"prompts": List of 100 prompts in the subset
-"labels": Tensor of true labels for the prompts in the subset, has shape (100)
-"subset_id": Integer ID of the subset (from 0 to 999)
"""

# Load the dataset.
# NOTE(review): torch.load unpickles arbitrary objects — only load "fulltuning.pt"
# from a trusted source.
dataset = torch.load("fulltuning.pt")

# Example: Accessing subsets
subset_0 = dataset["subset_0"]

# Quick sanity prints so users can see the subset structure described above.
print("Subset 0 keys:", subset_0.keys())
print("Subset ID:", subset_0["subset_id"])
print("Labels length:", len(subset_0["labels"]))
print("First prompts:", subset_0["prompts"][:5])
print("First 5 labels:", subset_0["labels"][:5])
37
+
38
# --------------------------------
# QUERYING THE CLASSIFIER
# --------------------------------

# This code can be used to load and query the fully fine-tuned models.
# You also need the available utils.py file.

#|---------------------------------------------------------------------------------------------------|
#| NOTE: "Missing or unexpected params" warnings are no reason for concern. They stem from the       |
#| fact that the model is first loaded without a classifier head, which is added afterwards.         |
#|---------------------------------------------------------------------------------------------------|

# Pick the tokenizer matching the model family you query below:
#   OLMo models:   "allenai/OLMo-1B-hf"
#   Pythia models: "EleutherAI/pythia-410m"
# (The original template loaded both tokenizers back to back, immediately
# discarding the first one — load only the one you need.)
TOKENIZER_NAME = "EleutherAI/pythia-410m"

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, trust_remote_code=True)

# Left padding so the final position of every sequence is a real token —
# the classifier heads pool the logits of the last position.
tokenizer.padding_side = "left"

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
62
+
63
+
64
def _print_logits(model, example_prompt, use_prefix_forward=False):
    """Tokenize *example_prompt*, run *model* on it, and print the logits.

    Parameters:
        model: a sequence-classification model returned by utils.get_fulltuning_model
            or utils.get_peft_model.
        example_prompt: the raw text to classify.
        use_prefix_forward: set True for prefix-tuned models, which must be
            called through utils.forward_peft_seqcls instead of a plain forward.
    """
    inputs = tokenizer(example_prompt, return_tensors="pt", truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    with torch.no_grad():
        if use_prefix_forward:
            outputs = utils.forward_peft_seqcls(model, **inputs)
        else:
            outputs = model(**inputs)
    logits = outputs.logits
    print(f"Logits shape: {logits.shape}")
    print(f"Logits: {logits}")


# Usage example (fulltuning):
model_path = "olmo-fulltuning"
model = utils.get_fulltuning_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
_print_logits(model, "I think, therefore I am.\n\nI am.")


# Usage example (softprompt):
model_path = "olmo-softprompt"
model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
_print_logits(model, "I think, but do I exist?\n\nSince you think, you exist.")


# Usage example (lora):
model_path = "olmo-lora"
model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
_print_logits(model, "Who am I?\n\nWhat am I?")


# Usage example (lastlayer):
model_path = "olmo-lastlayer"
model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
_print_logits(model, "I love to exist!")


# Usage example (prefix):
model_path = "olmo-prefix"
model = utils.get_peft_model(model_path, model_type="olmo")  # model_type can be "olmo" or "pythia"
# Prefix tuning needs the dedicated forward helper from utils.
_print_logits(model, "I will exist yesterday.", use_prefix_forward=True)
161
+
162
# --------------------------------
# SUBMISSION FORMAT
# --------------------------------

"""
The submission must be a .csv file with the following format:

-"type": Name of the model (e.g., "softprompt", "fulltuning", etc.)
-"subset_id": ID of the subset (from 0 to 999, per type)
-"membership": Membership score for each subset (float)
"""

# Example Submission: one row per (type, subset_id) pair — 5 * 1000 = 5000 rows.

types = ["softprompt", "fulltuning", "lora", "lastlayer", "prefix"]

# Each type repeated 1000 times, aligned with subset ids 0..999 per type.
type_list = [t for t in types for _ in range(1000)]
subset_ids = list(range(1000)) * len(types)

# Random scores as placeholders — replace with your attack's membership scores.
membership_scores = torch.rand(5000).tolist()

submission_df = pd.DataFrame({
    "type": type_list,
    "subset_id": subset_ids,
    "membership": membership_scores,
})
# index=False (documented bool parameter) instead of index=None.
submission_df.to_csv("example_submission.csv", index=False)
193
+
194
+ # --------------------------------
195
+ # SUBMISSION PROCESS
196
+ # --------------------------------
197
+
198
+ """
199
+ Example submission script for the LLM Dataset Membership Inference Task.
200
+
201
+ Submission Requirements (read carefully to avoid automatic rejection):
202
+
203
+ 1. CSV FORMAT
204
+ ----------------
205
+ - The file **must be a CSV** with extension `.csv`.
206
+ - It must contain **exactly three columns**, named:
207
+ type, subset_id, membership
208
+ → Column names must match exactly (lowercase, no extra spaces).
209
+ → Column order does not matter, but all three must be present.
210
+
211
+ 2. ROW COUNT AND IDENTIFIERS
212
+ -------------------------------
213
+ - Your file must contain **exactly 5000 rows**.
214
+ - Each row corresponds to one unique `subset_id`/`type` pair, with ids in the range **0–999** (inclusive).
215
+ - Every subset_id must appear **exactly once** for each type.
216
+ - Do **not** add, remove, or rename any IDs.
217
+ - Do **not** include duplicates or missing entries.
218
+ - The evaluator checks:
219
+ subset_id.min() == 0
220
+ subset_id.max() == 999
221
+ subset_id.unique().size == 1000
222
+
223
+ 3. MEMBERSHIP SCORES
224
+ ----------------------
225
+ - The `membership` column must contain **numeric values** representing your model’s predicted confidence
226
+ that the corresponding subset is a **member** of the training set.
227
+
228
+ Examples of valid membership values:
229
+ - Probabilities: values in [0.0, 1.0]
230
+ - Raw model scores: any finite numeric values (will be ranked for TPR@FPR=0.05)
231
+
232
+ - Do **not** submit string labels like "yes"/"no" or "member"/"non-member".
233
+ - The evaluator converts your `membership` column to numeric using `pd.to_numeric()`.
234
+ → Any non-numeric, NaN, or infinite entries will cause automatic rejection.
235
+
236
+ 4. TECHNICAL LIMITS
237
+ ----------------------
238
+ - Maximum file size: **20 MB**
239
+ - Encoding: UTF-8 recommended.
240
+ - Avoid extra columns, blank lines, or formulas.
241
+ - Ensure all values are numeric and finite.
242
+ - Supported data types: int, float (e.g., float32, float64)
243
+
244
+ 5. VALIDATION SUMMARY
245
+ ------------------------
246
+ Your submission will fail if:
247
+ - Columns don’t match exactly ("type", "subset_id", "membership")
248
+ - Row count differs from 5000
249
+ - Any type name is unexpected or not in the allowed set
250
+ - Any subset_id is missing, duplicated, or outside [0, 999] for any type
251
+ - Any membership value is NaN, Inf, or non-numeric
252
+ - File is too large or not a valid CSV
253
+
254
+ Two key metrics are computed:
255
+ 1. **ROC-AUC (Area Under the ROC Curve)** — measures overall discriminative ability.
256
+ 2. **TPR@FPR=0.05** — true positive rate when the false positive rate is at 5%.
257
+
258
+ """
259
+
260
BASE_URL = "http://35.192.205.84:80"
# Prefer the environment variable so real keys never end up committed;
# the literal below is the template placeholder — replace with your actual API key.
API_KEY = os.environ.get("SUBMISSION_API_KEY", "77ab4a807d03d48e1d85881b27496871")

TASK_ID = "14-llm-dataset-inference"
FILE_PATH = "example_submission.csv"  # replace with your actual file path

SUBMIT = False  # Set to True to enable submission


def die(msg):
    """Print *msg* to stderr and abort the process with exit code 1."""
    print(f"{msg}", file=sys.stderr)
    sys.exit(1)


if SUBMIT:
    if not os.path.isfile(FILE_PATH):
        die(f"File not found: {FILE_PATH}")

    try:
        with open(FILE_PATH, "rb") as f:
            files = {
                # (fieldname) -> (filename, fileobj, content_type)
                # Proper MIME type — "csv" alone is not a valid content type.
                "file": (os.path.basename(FILE_PATH), f, "text/csv"),
            }
            resp = requests.post(
                f"{BASE_URL}/submit/{TASK_ID}",
                headers={"X-API-Key": API_KEY},
                files=files,
                timeout=(10, 120),  # (connect timeout, read timeout)
            )

        # Helpful output even on non-2xx responses.
        try:
            body = resp.json()
        except ValueError:  # not JSON — keep the raw text for diagnostics
            body = {"raw_text": resp.text}

        if resp.status_code == 413:
            die("Upload rejected: file too large (HTTP 413). Reduce size and try again.")

        resp.raise_for_status()

        submission_id = body.get("submission_id")
        print("Successfully submitted.")
        print("Server response:", body)
        if submission_id:
            print(f"Submission ID: {submission_id}")

    except requests.exceptions.RequestException as e:
        detail = getattr(e, "response", None)
        print(f"Submission error: {e}")
        if detail is not None:
            try:
                print("Server response:", detail.json())
            except ValueError:
                print("Server response (text):", detail.text)
        sys.exit(1)
314
+
utils.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import OlmoModel, OlmoPreTrainedModel, GenerationMixin, AutoConfig, AutoModelForSequenceClassification
2
+ from transformers.modeling_outputs import SequenceClassifierOutputWithPast
3
+ import torch
4
+
5
+ from peft import PeftModel, PeftConfig
6
+
7
+ from transformers import AutoConfig
8
+
9
+ import logging
10
+ from contextlib import contextmanager
11
+ from types import SimpleNamespace
12
+
13
# The custom model for using Olmo with a sequence classification task

# Shared default device for model loading in this module.
device = "cuda" if torch.cuda.is_available() else "cpu"
16
+
17
class OlmoForSequenceClassification(OlmoPreTrainedModel, GenerationMixin):
    """OLMo backbone topped with a linear sequence-classification head.

    The head scores every token position and the logits of the LAST position
    are used as the sequence prediction, so inputs must be left-padded
    (tokenizer.padding_side = "left").
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = OlmoModel(config)
        self.classifier = torch.nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.LongTensor | None = None,
        **kwargs,
    ) -> SequenceClassifierOutputWithPast:
        """Run the backbone, score all positions, and pool the last token."""
        backbone_out = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )
        token_logits = self.classifier(backbone_out.last_hidden_state)
        # NOTE: tokenizer.padding_side must be 'left' for this pooling to pick
        # a real token rather than padding.
        pooled_logits = token_logits[:, -1]

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=token_logits,
                labels=labels,
                pooled_logits=pooled_logits,
                config=self.config,
            )

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=backbone_out.past_key_values,
            hidden_states=backbone_out.hidden_states,
            attentions=backbone_out.attentions,
        )
58
+
59
# The function for loading a fulltuning model

def get_fulltuning_model(model_path, model_type="olmo"):
    """Load a fully fine-tuned classifier from *model_path*.

    Parameters:
        model_path: local directory or hub id of the fine-tuned checkpoint.
        model_type: "olmo" (custom OlmoForSequenceClassification head) or
            "pythia" (stock AutoModelForSequenceClassification, 3 labels).

    Returns the model moved to the module-level default device and set to
    eval mode (the original only called .eval() on the olmo branch, leaving
    dropout active for pythia models at inference time).

    Raises:
        ValueError: for any other model_type.
    """
    if model_type == "olmo":
        model = OlmoForSequenceClassification.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype=torch.float32,
        ).to(device)
    elif model_type == "pythia":
        cfg = AutoConfig.from_pretrained(model_path, num_labels=3)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=cfg,
            torch_dtype=torch.float32,
        ).to(device)
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    # Inference only: disable dropout etc. for BOTH branches.
    model.eval()
    return model
80
+
81
# The following helpers suppress a "missing or unexpected params" warning.
# This warning is no reason for concern. It stems from the fact that the model
# is first loaded without a classifier head, which is added afterwards.

class DropLoadReport(logging.Filter):
    """Logging filter that drops any record mentioning 'LOAD REPORT'."""

    def filter(self, record: logging.LogRecord) -> bool:
        message = record.getMessage()
        return "LOAD REPORT" not in message


@contextmanager
def suppress_load_report_only():
    """Temporarily attach DropLoadReport to the transformers loggers.

    The filter is always removed again on exit, even if the body raises.
    """
    flt = DropLoadReport()

    target_loggers = [
        logging.getLogger(name)
        for name in (
            "transformers.modeling_utils",
            "transformers.modeling_tf_pytorch_utils",
            "transformers",
        )
    ]

    for lg in target_loggers:
        lg.addFilter(flt)
    try:
        yield
    finally:
        for lg in target_loggers:
            lg.removeFilter(flt)
107
+
108
# The function for loading a PEFT-adapted model (softprompt, lora, lastlayer, prefix)

def get_peft_model(model_path, model_type="olmo"):
    """Load a PEFT adapter from *model_path* on top of its base model.

    Parameters:
        model_path: directory containing the saved PEFT adapter + config.
        model_type: "olmo" (custom OlmoForSequenceClassification base) or
            "pythia" (stock AutoModelForSequenceClassification base).

    Returns the wrapped PeftModel in eval mode on the available device, with
    an extra `is_prefix_tuning` attribute used by forward_peft_seqcls.

    Raises:
        ValueError: for any other model_type.
    """
    peft_config = PeftConfig.from_pretrained(model_path)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if model_type == "olmo":
        config = AutoConfig.from_pretrained(
            peft_config.base_model_name_or_path,
            trust_remote_code=True,
            num_labels=2,
        )
        with suppress_load_report_only():
            base = OlmoForSequenceClassification.from_pretrained(
                peft_config.base_model_name_or_path,
                trust_remote_code=True,
                torch_dtype=torch.float32,
                config=config,
            ).to(device)

    elif model_type == "pythia":
        config = AutoConfig.from_pretrained(
            peft_config.base_model_name_or_path,
            num_labels=2,
        )
        with suppress_load_report_only():
            base = AutoModelForSequenceClassification.from_pretrained(
                peft_config.base_model_name_or_path,
                config=config,
                torch_dtype=torch.float32,
            ).to(device)
    else:
        raise ValueError(f"Unsupported model_type: {model_type}")

    with suppress_load_report_only():
        model = PeftModel.from_pretrained(base, model_path).to(device)

    # Robust prefix-tuning detection: comparing str(peft_type) against the
    # literal "PeftType.PREFIX_TUNING" breaks when the enum's str() output
    # changes across peft/Python versions; compare the enum NAME instead.
    peft_type = peft_config.peft_type
    peft_type_name = str(getattr(peft_type, "name", peft_type))
    model.is_prefix_tuning = peft_type_name.endswith("PREFIX_TUNING")

    # helpful for batching / last-token pooling
    if getattr(model.config, "pad_token_id", None) is None and getattr(model.config, "eos_token_id", None) is not None:
        model.config.pad_token_id = model.config.eos_token_id
    if hasattr(model, "base_model") and hasattr(model.base_model, "config"):
        base_cfg = model.base_model.config
        if getattr(base_cfg, "pad_token_id", None) is None and getattr(base_cfg, "eos_token_id", None) is not None:
            base_cfg.pad_token_id = base_cfg.eos_token_id

    model.eval()
    return model
156
+
157
# This function helps when loading prefix finetuned models

def forward_peft_seqcls(model, **inputs):
    """Forward pass for PEFT sequence classifiers, handling prefix tuning.

    Non-prefix models are called directly. For prefix-tuned models, the learned
    prefix key/values are injected as past_key_values and the attention mask is
    widened to cover the virtual tokens. If the base model's forward does not
    accept past_key_values (raises TypeError), the backbone and classifier head
    are invoked manually and the last-token logits are pooled by hand.

    Returns a model output (or SimpleNamespace on the manual path) with at
    least a .logits attribute of shape (batch, num_labels).
    """
    # Fast path: not prefix tuning — plain forward, caching disabled.
    if not getattr(model, "is_prefix_tuning", False):
        return model(**inputs, use_cache=False)

    input_ids = inputs.get("input_ids", None)
    attention_mask = inputs.get("attention_mask", None)
    inputs_embeds = inputs.get("inputs_embeds", None)
    labels = inputs.get("labels", None)
    output_attentions = inputs.get("output_attentions", None)
    output_hidden_states = inputs.get("output_hidden_states", None)
    return_dict = inputs.get("return_dict", True)

    if input_ids is not None:
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("Either input_ids or inputs_embeds must be provided.")

    # Learned prefix key/values for this batch size (peft API).
    past_key_values = model.get_prompt(batch_size)

    # The virtual prefix tokens must be attendable: prepend ones to the mask.
    if attention_mask is not None:
        num_virtual_tokens = model.active_peft_config.num_virtual_tokens
        prefix_attention_mask = torch.ones(
            batch_size,
            num_virtual_tokens,
            device=attention_mask.device,
            dtype=attention_mask.dtype,
        )
        attention_mask = torch.cat((prefix_attention_mask, attention_mask), dim=1)

    # Preferred path: let the base model consume the prefix via past_key_values.
    try:
        return model.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            past_key_values=past_key_values,
            use_cache=False,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
    except TypeError:
        # Base forward does not accept past_key_values — fall through to the
        # manual backbone + head path below.
        pass

    # Manual fallback: run the bare transformer backbone with the prefix.
    transformer_backbone = model.base_model.get_submodule(model.transformer_backbone_name)

    outputs = transformer_backbone(
        input_ids=input_ids,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        past_key_values=past_key_values,
        use_cache=False,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=True,
    )

    hidden_states = outputs[0]

    # Mirror the base model's own head pipeline: optional dropout, then the
    # classification layer.
    if "dropout" in [name for name, _ in model.base_model.named_children()]:
        hidden_states = model.base_model.dropout(hidden_states)

    cls_layer = model.base_model.get_submodule(model.cls_layer_name)
    token_logits = cls_layer(hidden_states)

    # Last-token pooling (inputs are left-padded upstream).
    logits = token_logits[:, -1]

    return SimpleNamespace(
        logits=logits,
        hidden_states=getattr(outputs, "hidden_states", None),
        attentions=getattr(outputs, "attentions", None),
    )