Feature Extraction
Transformers
Safetensors
English
bert
contrastive-learning
embeddings
political-science
social-groups
clustering
text-embeddings-inference
Instructions to use maxwlnd/cl_mention_embedding with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use maxwlnd/cl_mention_embedding with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="maxwlnd/cl_mention_embedding")

# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("maxwlnd/cl_mention_embedding", dtype="auto")
- Notebooks
- Google Colab
- Kaggle
MaximilianWeiland committed on
Commit ·
fd5f978
1
Parent(s): fb420c1
Add model weights and tokenizer files
Browse files
- README.md +0 -3
- config.json +8 -0
- model.safetensors +3 -0
- prepare_upload.py +52 -0
- special_tokens_map.json +7 -0
- tokenizer.json +0 -0
- tokenizer_config.json +56 -0
- vocab.txt +0 -0
README.md
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
config.json
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "cl_mention_embedding",
|
| 3 |
+
"pretrained_model_name": "bert-base-uncased",
|
| 4 |
+
"hidden_size": 768,
|
| 5 |
+
"proj_dim": 128,
|
| 6 |
+
"prompt_template": "Social group of {} is: [MASK].",
|
| 7 |
+
"max_length": 128
|
| 8 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b7871cf714e031f2c94d56259b6f322dbe2cca78c602609f361fa735673b44a
|
| 3 |
+
size 438346792
|
prepare_upload.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prepares the cl_mention_embedding HuggingFace repo folder for upload.
|
| 3 |
+
|
| 4 |
+
Run from the thesis project root:
|
| 5 |
+
python huggingface_models/cl_mention_embedding/prepare_upload.py
|
| 6 |
+
|
| 7 |
+
Then upload with:
|
| 8 |
+
huggingface-cli upload maxwlnd/cl_mention_embedding huggingface_models/cl_mention_embedding .
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import json
|
| 12 |
+
import sys
|
| 13 |
+
import torch
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from safetensors.torch import save_file
|
| 16 |
+
from transformers import AutoTokenizer
|
| 17 |
+
|
| 18 |
+
project_root = Path(__file__).resolve().parents[2]
|
| 19 |
+
sys.path.append(str(project_root))
|
| 20 |
+
from utils.clustering import ModelMask
|
| 21 |
+
|
| 22 |
+
output_dir = Path(__file__).parent
|
| 23 |
+
checkpoint_path = project_root / "04_clustering/model_checkpoint/checkpoint.pt"
|
| 24 |
+
|
| 25 |
+
# --- weights ---
|
| 26 |
+
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
| 27 |
+
model = ModelMask(tokenizer=tokenizer, pretrained_model_name="bert-base-uncased", proj_dim=128)
|
| 28 |
+
state_dict = torch.load(checkpoint_path, map_location="cpu")
|
| 29 |
+
model.load_state_dict(state_dict)
|
| 30 |
+
save_file(model.state_dict(), output_dir / "model.safetensors")
|
| 31 |
+
print("Saved model.safetensors")
|
| 32 |
+
|
| 33 |
+
# --- config ---
|
| 34 |
+
config = {
|
| 35 |
+
"model_type": "cl_mention_embedding",
|
| 36 |
+
"pretrained_model_name": "bert-base-uncased",
|
| 37 |
+
"hidden_size": model.hidden_size,
|
| 38 |
+
"proj_dim": model.proj_dim,
|
| 39 |
+
"prompt_template": "Social group of {} is: [MASK].",
|
| 40 |
+
"max_length": 128
|
| 41 |
+
}
|
| 42 |
+
with open(output_dir / "config.json", "w") as f:
|
| 43 |
+
json.dump(config, f, indent=2)
|
| 44 |
+
print("Saved config.json")
|
| 45 |
+
|
| 46 |
+
# --- tokenizer ---
|
| 47 |
+
tokenizer.save_pretrained(output_dir)
|
| 48 |
+
print("Saved tokenizer files")
|
| 49 |
+
|
| 50 |
+
print(f"\nDone. Files written to {output_dir}")
|
| 51 |
+
print("\nUpload with:")
|
| 52 |
+
print(" huggingface-cli upload maxwlnd/cl_mention_embedding huggingface_models/cl_mention_embedding .")
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cls_token": "[CLS]",
|
| 3 |
+
"mask_token": "[MASK]",
|
| 4 |
+
"pad_token": "[PAD]",
|
| 5 |
+
"sep_token": "[SEP]",
|
| 6 |
+
"unk_token": "[UNK]"
|
| 7 |
+
}
|
tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"added_tokens_decoder": {
|
| 3 |
+
"0": {
|
| 4 |
+
"content": "[PAD]",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false,
|
| 9 |
+
"special": true
|
| 10 |
+
},
|
| 11 |
+
"100": {
|
| 12 |
+
"content": "[UNK]",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false,
|
| 17 |
+
"special": true
|
| 18 |
+
},
|
| 19 |
+
"101": {
|
| 20 |
+
"content": "[CLS]",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false,
|
| 25 |
+
"special": true
|
| 26 |
+
},
|
| 27 |
+
"102": {
|
| 28 |
+
"content": "[SEP]",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": false,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false,
|
| 33 |
+
"special": true
|
| 34 |
+
},
|
| 35 |
+
"103": {
|
| 36 |
+
"content": "[MASK]",
|
| 37 |
+
"lstrip": false,
|
| 38 |
+
"normalized": false,
|
| 39 |
+
"rstrip": false,
|
| 40 |
+
"single_word": false,
|
| 41 |
+
"special": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"clean_up_tokenization_spaces": false,
|
| 45 |
+
"cls_token": "[CLS]",
|
| 46 |
+
"do_lower_case": true,
|
| 47 |
+
"extra_special_tokens": {},
|
| 48 |
+
"mask_token": "[MASK]",
|
| 49 |
+
"model_max_length": 512,
|
| 50 |
+
"pad_token": "[PAD]",
|
| 51 |
+
"sep_token": "[SEP]",
|
| 52 |
+
"strip_accents": null,
|
| 53 |
+
"tokenize_chinese_chars": true,
|
| 54 |
+
"tokenizer_class": "BertTokenizer",
|
| 55 |
+
"unk_token": "[UNK]"
|
| 56 |
+
}
|
vocab.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|