Update app.py
Browse files
app.py
CHANGED
|
@@ -3,7 +3,6 @@ import csv
|
|
| 3 |
import json
|
| 4 |
import torch
|
| 5 |
import shutil
|
| 6 |
-
import requests
|
| 7 |
import textwrap
|
| 8 |
import numpy as np
|
| 9 |
import pandas as pd
|
|
@@ -78,15 +77,16 @@ def get_test_sentence(target_lang: str, source_lang: str = "eng_Latn"):
|
|
| 78 |
translator = pipeline(task="translation", tokenizer=model_name, model=model_name)
|
| 79 |
return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
|
| 80 |
|
| 81 |
-
def push_to_hub(
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
|
|
|
| 85 |
api.create_repo(repo_id=repo_id, repo_type="model", private=private)
|
| 86 |
api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
|
| 87 |
|
| 88 |
-
def prune_model(model_name: str, language: str,
|
| 89 |
-
st.markdown(f"-
|
| 90 |
|
| 91 |
# Load the model and its tokenizer
|
| 92 |
model, tokenizer = load_model_and_tokenizer(model_name)
|
|
@@ -97,7 +97,7 @@ def prune_model(model_name: str, language: str, username: str, token: str):
|
|
| 97 |
embedding_params = count_parameters(model, layer_name="embeddings")
|
| 98 |
|
| 99 |
st.markdown(
|
| 100 |
-
f"- The model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
|
| 101 |
f"(i.e., {embedding_params/1e6:.1f}M params) come from the *embedding matrix* and its {tokenizer.vocab_size} token entries. "+
|
| 102 |
f"This means that the contextualization of text sequences is actually done by a *{model.config.num_hidden_layers}-layer Transformer encoder* "+
|
| 103 |
f"with **{encoder_params/1e6:.1f}M** parameters only."
|
|
@@ -110,77 +110,82 @@ def prune_model(model_name: str, language: str, username: str, token: str):
|
|
| 110 |
f"of the model vocabulary (i.e., {len(filtered_tokens)} out of the original {tokenizer.vocab_size} tokens)."
|
| 111 |
)
|
| 112 |
|
| 113 |
-
st.
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
st.
|
| 180 |
-
st.
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
# Show visually the result of the pruning process
|
| 186 |
pruned_all_params = count_parameters(new_model)
|
|
@@ -201,7 +206,7 @@ def prune_model(model_name: str, language: str, username: str, token: str):
|
|
| 201 |
st.plotly_chart(fig)
|
| 202 |
|
| 203 |
# Add a README to the pruned model repo
|
| 204 |
-
new_model_name = f"{
|
| 205 |
readme_content = textwrap.dedent(f"""
|
| 206 |
---
|
| 207 |
pipeline_tag: sentence-similarity
|
|
@@ -213,19 +218,16 @@ def prune_model(model_name: str, language: str, username: str, token: str):
|
|
| 213 |
- pruned
|
| 214 |
library_name: sentence-transformers
|
| 215 |
base_model: {model_name}
|
| 216 |
-
base_model_relation:
|
| 217 |
---
|
| 218 |
-
# {new_model_name.split('/')[-1]}
|
| 219 |
|
| 220 |
-
This model is a
|
|
|
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
the model's
|
| 225 |
-
|
| 226 |
-
This pruned model should perform similarly to the original model for {language.capitalize()} language tasks, but with a much smaller
|
| 227 |
-
memory footprint ({100 - pruned_all_params/all_params*100:.1f}% smaller). However, it may not perform well for other languages present
|
| 228 |
-
in the original multilingual model.
|
| 229 |
|
| 230 |
## Usage
|
| 231 |
|
|
@@ -238,13 +240,16 @@ def prune_model(model_name: str, language: str, username: str, token: str):
|
|
| 238 |
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
|
| 239 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
|
| 240 |
```
|
|
|
|
|
|
|
| 241 |
""")
|
| 242 |
with open(os.path.join(outdir, "README.md"), "w") as f:
|
| 243 |
f.write(readme_content)
|
| 244 |
|
| 245 |
-
st.
|
| 246 |
-
|
| 247 |
-
|
|
|
|
| 248 |
|
| 249 |
st.markdown("Done! You can now load your pruned model like this:")
|
| 250 |
st.code(f"""
|
|
@@ -261,7 +266,7 @@ def main():
|
|
| 261 |
st.markdown("""
|
| 262 |
This space helps you create a smaller, language-specific version of a multilingual text embedding model. Here's what it does:
|
| 263 |
|
| 264 |
-
1. 🌎 Takes a
|
| 265 |
2. ✂️ Trims it down to focus on just one language by removing unused tokens from its vocabulary
|
| 266 |
3. 🚀 Gives you a smaller model that works just as well for your chosen language
|
| 267 |
|
|
@@ -279,14 +284,17 @@ def main():
|
|
| 279 |
options=list(LANGUAGES.keys()),
|
| 280 |
format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
|
| 281 |
)
|
| 282 |
-
|
| 283 |
-
|
|
|
|
|
|
|
|
|
|
| 284 |
|
| 285 |
if st.button("Prune Model"):
|
| 286 |
-
if not
|
| 287 |
st.error("Your HF username and access token is required to save the pruned model on your account.")
|
| 288 |
else:
|
| 289 |
-
prune_model(model_name, language,
|
| 290 |
|
| 291 |
st.markdown(
|
| 292 |
"""
|
|
|
|
| 3 |
import json
|
| 4 |
import torch
|
| 5 |
import shutil
|
|
|
|
| 6 |
import textwrap
|
| 7 |
import numpy as np
|
| 8 |
import pandas as pd
|
|
|
|
| 77 |
translator = pipeline(task="translation", tokenizer=model_name, model=model_name)
|
| 78 |
return translator(text, src_lang=source_lang, tgt_lang=target_lang)[0]['translation_text']
|
| 79 |
|
| 80 |
+
def push_to_hub(hf_username: str, hf_token: str, model_dir: str, private: bool = False):
    """Upload a local model folder to the Hugging Face Hub as a model repo.

    The repo is created as ``{hf_username}/{last path component of model_dir}``
    and the whole folder is uploaded in a single commit.

    Args:
        hf_username: Account name used as the repo namespace.
        hf_token: Access token handed to ``whoami`` and ``HfApi``.
        model_dir: Path of the local folder to upload; its last path
            component becomes the repo name.
        private: Whether the repo is created as private.
    """
    # SECURITY: never print or log the access token. Anything written to
    # stdout ends up in the server logs, which would leak the user's credential.

    # The result is unused: the call is made up front so that a bad token is
    # reported before any repo is created (assumes `whoami` raises on an
    # invalid token -- confirm against huggingface_hub).
    _ = whoami(token=hf_token)

    api = HfApi(endpoint="https://huggingface.co", token=hf_token)

    # Strip trailing slashes first so that "some/dir/" still yields "dir"
    # instead of an empty repo name.
    repo_name = model_dir.rstrip('/').split('/')[-1]
    repo_id = f"{hf_username}/{repo_name}"

    api.create_repo(repo_id=repo_id, repo_type="model", private=private)
    api.upload_folder(repo_id=repo_id, folder_path=model_dir, commit_message="Upload pruned model")
|
| 87 |
|
| 88 |
+
def prune_model(model_name: str, language: str, hf_username: str, hf_token: str):
|
| 89 |
+
st.markdown(f"- Let's prune the [**{model_name}**](https://huggingface.co/{model_name}) model to keep its **{language.capitalize()}** tokens only.")
|
| 90 |
|
| 91 |
# Load the model and its tokenizer
|
| 92 |
model, tokenizer = load_model_and_tokenizer(model_name)
|
|
|
|
| 97 |
embedding_params = count_parameters(model, layer_name="embeddings")
|
| 98 |
|
| 99 |
st.markdown(
|
| 100 |
+
f"- The original model has **{all_params/1e6:.1f}M** parameters, of which **{embedding_params/all_params*100:.0f}%** "+
|
| 101 |
f"(i.e., {embedding_params/1e6:.1f}M params) come from the *embedding matrix* and its {tokenizer.vocab_size} token entries. "+
|
| 102 |
f"This means that the contextualization of text sequences is actually done by a *{model.config.num_hidden_layers}-layer Transformer encoder* "+
|
| 103 |
f"with **{encoder_params/1e6:.1f}M** parameters only."
|
|
|
|
| 110 |
f"of the model vocabulary (i.e., {len(filtered_tokens)} out of the original {tokenizer.vocab_size} tokens)."
|
| 111 |
)
|
| 112 |
|
| 113 |
+
with st.status("Pruning the model...", expanded=True) as status:
|
| 114 |
+
st.write("- *Updating the tokenizer*")
|
| 115 |
+
outdir = f"{language}-{model_name.split('/')[-1]}"
|
| 116 |
+
|
| 117 |
+
# Export the tokenizer to a JSON string and access its vocabulary (list of lists: [[token, score], ...])
|
| 118 |
+
tokenizer_json = json.loads(tokenizer.backend_tokenizer.to_str())
|
| 119 |
+
original_vocab = tokenizer_json['model']['vocab']
|
| 120 |
+
|
| 121 |
+
# Build a mapping from tokens to their original IDs
|
| 122 |
+
original_token_to_id = {entry[0]: idx for idx, entry in enumerate(original_vocab)}
|
| 123 |
+
|
| 124 |
+
# Filter out the tokens to remove and reassign new IDs
|
| 125 |
+
new_id = 0
|
| 126 |
+
new_token_to_id = {}
|
| 127 |
+
new_id_to_original_id = {}
|
| 128 |
+
filtered_vocab_entries = []
|
| 129 |
+
|
| 130 |
+
for token, score in original_vocab:
|
| 131 |
+
if token in filtered_tokens:
|
| 132 |
+
filtered_vocab_entries.append([token, score])
|
| 133 |
+
new_token_to_id[token] = new_id
|
| 134 |
+
new_id_to_original_id[new_id] = original_token_to_id[token]
|
| 135 |
+
new_id += 1
|
| 136 |
+
|
| 137 |
+
# Update the vocab in the tokenizer JSON and rebuild the tokenizer from the modified JSON
|
| 138 |
+
tokenizer_json['model']['vocab'] = filtered_vocab_entries
|
| 139 |
+
new_backend_tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
|
| 140 |
+
|
| 141 |
+
# Create a new tokenizer instance and save it
|
| 142 |
+
new_tokenizer = PreTrainedTokenizerFast(tokenizer_object=new_backend_tokenizer, **tokenizer.init_kwargs)
|
| 143 |
+
new_tokenizer.save_pretrained(outdir)
|
| 144 |
+
|
| 145 |
+
st.write("- *Updating the embedding matrix*")
|
| 146 |
+
new_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
|
| 147 |
+
|
| 148 |
+
# Create a new embedding matrix and map the original vectors to their new IDs
|
| 149 |
+
original_embeddings = new_model.get_input_embeddings().weight.data
|
| 150 |
+
new_embeddings = torch.nn.Embedding(
|
| 151 |
+
num_embeddings=new_tokenizer.vocab_size,
|
| 152 |
+
embedding_dim=model.config.hidden_size,
|
| 153 |
+
padding_idx=new_tokenizer.pad_token_id,
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
for new_id in range(new_tokenizer.vocab_size):
|
| 157 |
+
original_id = new_id_to_original_id.get(new_id)
|
| 158 |
+
new_embeddings.weight.data[new_id] = original_embeddings[original_id]
|
| 159 |
+
|
| 160 |
+
new_model.set_input_embeddings(new_embeddings)
|
| 161 |
+
new_model.config.vocab_size = new_tokenizer.vocab_size
|
| 162 |
+
new_model.save_pretrained(outdir)
|
| 163 |
+
|
| 164 |
+
status.update(state="complete", expanded=True)
|
| 165 |
+
|
| 166 |
+
with st.status("Testing the conversion...", expanded=True) as status:
|
| 167 |
+
st.write(f"- *Checking the pruned tokenizer*")
|
| 168 |
+
assert len(new_tokenizer) == len(filtered_tokens), f"ERROR: new tokenizer size ({len(new_tokenizer)}) != number of filtered tokens ({len(filtered_tokens)})"
|
| 169 |
+
assert filtered_tokens == set(new_tokenizer.convert_ids_to_tokens(range(len(new_tokenizer)))), f"ERROR: The new tokenizer vocabulary doesn't match number of the filtered tokens"
|
| 170 |
+
|
| 171 |
+
st.write(f"- *Checking the pruned model*")
|
| 172 |
+
test_sentence = get_test_sentence(LANGUAGES[language]['nllb_code'])
|
| 173 |
+
with torch.inference_mode():
|
| 174 |
+
emb1 = model(**tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
|
| 175 |
+
emb2 = new_model(**new_tokenizer(test_sentence, return_tensors='pt')).last_hidden_state[:, 0][0].numpy()
|
| 176 |
+
diff = np.abs(emb1 - emb2).max()
|
| 177 |
+
assert diff < 1e-6, f"ERROR: Some dimensions of the two vectors have a non negligible difference ({diff})"
|
| 178 |
+
|
| 179 |
+
st.write(f"""All good! The output *[cls]* token embedding of the test sentence *"{test_sentence}"* should be similar:""")
|
| 180 |
+
col1, col2 = st.columns(2)
|
| 181 |
+
with col1:
|
| 182 |
+
st.markdown("Original model:")
|
| 183 |
+
st.code(f"{emb1.tolist()}")
|
| 184 |
+
with col2:
|
| 185 |
+
st.markdown("Pruned model:")
|
| 186 |
+
st.code(f"{emb2.tolist()}")
|
| 187 |
+
|
| 188 |
+
status.update(state="complete", expanded=True)
|
| 189 |
|
| 190 |
# Show visually the result of the pruning process
|
| 191 |
pruned_all_params = count_parameters(new_model)
|
|
|
|
| 206 |
st.plotly_chart(fig)
|
| 207 |
|
| 208 |
# Add a README to the pruned model repo
|
| 209 |
+
new_model_name = f"{hf_username}/{outdir.split('/')[-1]}"
|
| 210 |
readme_content = textwrap.dedent(f"""
|
| 211 |
---
|
| 212 |
pipeline_tag: sentence-similarity
|
|
|
|
| 218 |
- pruned
|
| 219 |
library_name: sentence-transformers
|
| 220 |
base_model: {model_name}
|
| 221 |
+
base_model_relation: quantized
|
| 222 |
---
|
| 223 |
+
# {LANGUAGES[language]['emoji']} {new_model_name.split('/')[-1]}
|
| 224 |
|
| 225 |
+
This model is a {100 - pruned_all_params/all_params*100:.1f}% smaller version of [{model_name}](https://huggingface.co/{model_name})
|
| 226 |
+
for the {language.capitalize()} language, created using the [mtem-pruner](https://huggingface.co/spaces/antoinelouis/mtem-pruner) space.
|
| 227 |
|
| 228 |
+
This pruned model should perform similarly to the original model for {language.capitalize()} language tasks with a much smaller
|
| 229 |
+
memory footprint. However, it may not perform well for other languages present in the original multilingual model as tokens not
|
| 230 |
+
commonly used in {language.capitalize()} were removed from the original multilingual model's vocabulary.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
## Usage
|
| 233 |
|
|
|
|
| 240 |
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
|
| 241 |
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=True)
|
| 242 |
```
|
| 243 |
+
|
| 244 |
+
**Credits**: cc [@antoinelouis](https://huggingface.co/antoinelouis)
|
| 245 |
""")
|
| 246 |
with open(os.path.join(outdir, "README.md"), "w") as f:
|
| 247 |
f.write(readme_content)
|
| 248 |
|
| 249 |
+
with st.status("Pushing the pruned model to your Hugging Face account...", expanded=True) as status:
|
| 250 |
+
#push_to_hub(hf_username, hf_token, outdir)
|
| 251 |
+
shutil.rmtree(outdir)
|
| 252 |
+
status.update(state="complete", expanded=False)
|
| 253 |
|
| 254 |
st.markdown("Done! You can now load your pruned model like this:")
|
| 255 |
st.code(f"""
|
|
|
|
| 266 |
st.markdown("""
|
| 267 |
This space helps you create a smaller, language-specific version of a multilingual text embedding model. Here's what it does:
|
| 268 |
|
| 269 |
+
1. 🌎 Takes a state-of-the-art text embedding model that was trained on many languages
|
| 270 |
2. ✂️ Trims it down to focus on just one language by removing unused tokens from its vocabulary
|
| 271 |
3. 🚀 Gives you a smaller model that works just as well for your chosen language
|
| 272 |
|
|
|
|
| 284 |
options=list(LANGUAGES.keys()),
|
| 285 |
format_func=lambda x: f"{LANGUAGES[x]['emoji']} {x.capitalize()}"
|
| 286 |
)
|
| 287 |
+
col1, col2 = st.columns(2)
|
| 288 |
+
with col1:
|
| 289 |
+
hf_username = st.text_input("Your Hugging Face username", placeholder="antoinelouis")
|
| 290 |
+
with col2:
|
| 291 |
+
hf_token = st.text_input("Your Hugging Face access token", type="password", placeholder="hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")
|
| 292 |
|
| 293 |
if st.button("Prune Model"):
|
| 294 |
+
if not hf_username or not hf_token:
|
| 295 |
st.error("Your HF username and access token is required to save the pruned model on your account.")
|
| 296 |
else:
|
| 297 |
+
prune_model(model_name, language, hf_username, hf_token)
|
| 298 |
|
| 299 |
st.markdown(
|
| 300 |
"""
|