Add model weights and tokenizer files

Files changed (8) hide show

README.md +0 -3
config.json +8 -0
model.safetensors +3 -0
prepare_upload.py +52 -0
special_tokens_map.json +7 -0
tokenizer.json +0 -0
tokenizer_config.json +56 -0
vocab.txt +0 -0

README.md DELETED Viewed

@@ -1,3 +0,0 @@
----
-license: mit
----

config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "model_type": "cl_mention_embedding",
+  "pretrained_model_name": "bert-base-uncased",
+  "hidden_size": 768,
+  "proj_dim": 128,
+  "prompt_template": "Social group of {} is: [MASK].",
+  "max_length": 128
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7b7871cf714e031f2c94d56259b6f322dbe2cca78c602609f361fa735673b44a
+size 438346792

prepare_upload.py ADDED Viewed

	@@ -0,0 +1,52 @@

+"""
+Prepares the cl_mention_embedding HuggingFace repo folder for upload.
+Run from the thesis project root:
+    python huggingface_models/cl_mention_embedding/prepare_upload.py
+Then upload with:
+    huggingface-cli upload maxwlnd/cl_mention_embedding huggingface_models/cl_mention_embedding .
+"""
+import json
+import sys
+import torch
+from pathlib import Path
+from safetensors.torch import save_file
+from transformers import AutoTokenizer
+project_root = Path(__file__).resolve().parents[2]
+sys.path.append(str(project_root))
+from utils.clustering import ModelMask
+output_dir = Path(__file__).parent
+checkpoint_path = project_root / "04_clustering/model_checkpoint/checkpoint.pt"
+# --- weights ---
+tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = ModelMask(tokenizer=tokenizer, pretrained_model_name="bert-base-uncased", proj_dim=128)
+state_dict = torch.load(checkpoint_path, map_location="cpu")
+model.load_state_dict(state_dict)
+save_file(model.state_dict(), output_dir / "model.safetensors")
+print("Saved model.safetensors")
+# --- config ---
+config = {
+    "model_type": "cl_mention_embedding",
+    "pretrained_model_name": "bert-base-uncased",
+    "hidden_size": model.hidden_size,
+    "proj_dim": model.proj_dim,
+    "prompt_template": "Social group of {} is: [MASK].",
+    "max_length": 128
+}
+with open(output_dir / "config.json", "w") as f:
+    json.dump(config, f, indent=2)
+print("Saved config.json")
+# --- tokenizer ---
+tokenizer.save_pretrained(output_dir)
+print("Saved tokenizer files")
+print(f"\nDone. Files written to {output_dir}")
+print("\nUpload with:")
+print("  huggingface-cli upload maxwlnd/cl_mention_embedding huggingface_models/cl_mention_embedding .")

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,56 @@

+{
+  "added_tokens_decoder": {
+    "0": {
+      "content": "[PAD]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "100": {
+      "content": "[UNK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "101": {
+      "content": "[CLS]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "102": {
+      "content": "[SEP]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "103": {
+      "content": "[MASK]",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "clean_up_tokenization_spaces": false,
+  "cls_token": "[CLS]",
+  "do_lower_case": true,
+  "extra_special_tokens": {},
+  "mask_token": "[MASK]",
+  "model_max_length": 512,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}

vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff