MaximilianWeiland committed on
Commit
fd5f978
·
1 Parent(s): fb420c1

Add model weights and tokenizer files

Browse files
README.md DELETED
@@ -1,3 +0,0 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "cl_mention_embedding",
3
+ "pretrained_model_name": "bert-base-uncased",
4
+ "hidden_size": 768,
5
+ "proj_dim": 128,
6
+ "prompt_template": "Social group of {} is: [MASK].",
7
+ "max_length": 128
8
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b7871cf714e031f2c94d56259b6f322dbe2cca78c602609f361fa735673b44a
3
+ size 438346792
prepare_upload.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prepares the cl_mention_embedding HuggingFace repo folder for upload.
3
+
4
+ Run from the thesis project root:
5
+ python huggingface_models/cl_mention_embedding/prepare_upload.py
6
+
7
+ Then upload with:
8
+ huggingface-cli upload maxwlnd/cl_mention_embedding huggingface_models/cl_mention_embedding .
9
+ """
10
+
11
+ import json
12
+ import sys
13
+ import torch
14
+ from pathlib import Path
15
+ from safetensors.torch import save_file
16
+ from transformers import AutoTokenizer
17
+
18
+ project_root = Path(__file__).resolve().parents[2]
19
+ sys.path.append(str(project_root))
20
+ from utils.clustering import ModelMask
21
+
22
+ output_dir = Path(__file__).parent
23
+ checkpoint_path = project_root / "04_clustering/model_checkpoint/checkpoint.pt"
24
+
25
+ # --- weights ---
26
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
27
+ model = ModelMask(tokenizer=tokenizer, pretrained_model_name="bert-base-uncased", proj_dim=128)
28
+ state_dict = torch.load(checkpoint_path, map_location="cpu")
29
+ model.load_state_dict(state_dict)
30
+ save_file(model.state_dict(), output_dir / "model.safetensors")
31
+ print("Saved model.safetensors")
32
+
33
+ # --- config ---
34
+ config = {
35
+ "model_type": "cl_mention_embedding",
36
+ "pretrained_model_name": "bert-base-uncased",
37
+ "hidden_size": model.hidden_size,
38
+ "proj_dim": model.proj_dim,
39
+ "prompt_template": "Social group of {} is: [MASK].",
40
+ "max_length": 128
41
+ }
42
+ with open(output_dir / "config.json", "w") as f:
43
+ json.dump(config, f, indent=2)
44
+ print("Saved config.json")
45
+
46
+ # --- tokenizer ---
47
+ tokenizer.save_pretrained(output_dir)
48
+ print("Saved tokenizer files")
49
+
50
+ print(f"\nDone. Files written to {output_dir}")
51
+ print("\nUpload with:")
52
+ print(" huggingface-cli upload maxwlnd/cl_mention_embedding huggingface_models/cl_mention_embedding .")
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "[CLS]",
46
+ "do_lower_case": true,
47
+ "extra_special_tokens": {},
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "pad_token": "[PAD]",
51
+ "sep_token": "[SEP]",
52
+ "strip_accents": null,
53
+ "tokenize_chinese_chars": true,
54
+ "tokenizer_class": "BertTokenizer",
55
+ "unk_token": "[UNK]"
56
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff