lfhe committed
Commit 59f80c3 · 1 Parent(s): 917c80b
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: microsoft/Phi-4-mini-instruct
+base_model: microsoft/Phi-3-medium-128k-instruct
 library_name: peft
 ---
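The model card now points the adapter at microsoft/Phi-3-medium-128k-instruct. A minimal loading sketch under that assumption (the adapter repo id below is a placeholder, not something this commit specifies):

```python
# Hedged sketch: attach this LoRA adapter to the new base model with PEFT.
# "user/phi3-medium-adapter" is a placeholder repo id, not part of this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "microsoft/Phi-3-medium-128k-instruct"  # base_model from the updated card
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(base_id)

# Attach the adapter weights tracked in this repository.
model = PeftModel.from_pretrained(base, "user/phi3-medium-adapter")
model.eval()
```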
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "microsoft/Phi-4-mini-instruct",
+  "base_model_name_or_path": "microsoft/Phi-3-medium-128k-instruct",
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
@@ -13,9 +13,9 @@
   "layers_pattern": null,
   "layers_to_transform": null,
   "loftq_config": {},
-  "lora_alpha": 8,
+  "lora_alpha": 16,
   "lora_bias": false,
-  "lora_dropout": 0.05,
+  "lora_dropout": 0.25,
   "megatron_config": null,
   "megatron_core": "megatron.core",
   "modules_to_save": null,
@@ -24,8 +24,6 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "down_proj",
-    "gate_up_proj",
     "o_proj",
     "qkv_proj"
   ],
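The new adapter doubles lora_alpha (8 → 16), raises lora_dropout (0.05 → 0.25), and drops the MLP projections (down_proj, gate_up_proj), so only the attention projections are adapted. A sketch of an equivalent LoraConfig; the LoRA rank r is not shown in this diff, so it is left at PEFT's default here:

```python
from peft import LoraConfig, TaskType

# Approximate reconstruction of the updated adapter_config.json; `r` is assumed, not shown above.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    base_model_name_or_path="microsoft/Phi-3-medium-128k-instruct",
    lora_alpha=16,
    lora_dropout=0.25,
    bias="none",
    target_modules=["o_proj", "qkv_proj"],  # down_proj / gate_up_proj no longer targeted
)
```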
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:219f86143308bbf5a40521635145ad144d4da92a2d9db5710ffd6d68c0cf2b00
-size 46171456
+oid sha256:e211ae9f7ba6800d50e6b895986a46b5aec0979c030527053c9e70baf1197d8d
+size 29512680
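The adapter weights shrink from roughly 46 MB to 29.5 MB, consistent with fewer target modules. A small check, if you want to confirm a downloaded copy matches the new LFS pointer (the local filename is an assumption):

```python
import hashlib

# Values copied from the LFS pointer above.
expected_oid = "e211ae9f7ba6800d50e6b895986a46b5aec0979c030527053c9e70baf1197d8d"
expected_size = 29512680

with open("adapter_model.safetensors", "rb") as f:
    blob = f.read()

assert len(blob) == expected_size
assert hashlib.sha256(blob).hexdigest() == expected_oid
```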
added_tokens.json CHANGED
@@ -1,12 +1,13 @@
 {
-  "<|/tool_call|>": 200026,
-  "<|/tool|>": 200024,
-  "<|assistant|>": 200019,
-  "<|end|>": 200020,
-  "<|system|>": 200022,
-  "<|tag|>": 200028,
-  "<|tool_call|>": 200025,
-  "<|tool_response|>": 200027,
-  "<|tool|>": 200023,
-  "<|user|>": 200021
+  "<|assistant|>": 32001,
+  "<|endoftext|>": 32000,
+  "<|end|>": 32007,
+  "<|placeholder1|>": 32002,
+  "<|placeholder2|>": 32003,
+  "<|placeholder3|>": 32004,
+  "<|placeholder4|>": 32005,
+  "<|placeholder5|>": 32008,
+  "<|placeholder6|>": 32009,
+  "<|system|>": 32006,
+  "<|user|>": 32010
 }
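The added tokens move from Phi-4-mini's 200000-range ids to Phi-3's 32000-range ids. A quick sanity check against the base model's tokenizer, assuming it matches the files in this commit:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")

# Ids taken from the added_tokens.json above.
expected = {"<|endoftext|>": 32000, "<|assistant|>": 32001, "<|end|>": 32007, "<|user|>": 32010}
for token, token_id in expected.items():
    assert tok.convert_tokens_to_ids(token) == token_id, token
```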
cl100k_base.tiktoken DELETED
The diff for this file is too large to render. See raw diff
 
merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -9,7 +9,7 @@
     }
   ],
   "bos_token": {
-    "content": "<|endoftext|>",
+    "content": "<s>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
@@ -30,7 +30,7 @@
     "single_word": false
   },
   "unk_token": {
-    "content": "<|endoftext|>",
+    "content": "<unk>",
    "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenization_phi3_small.py DELETED
@@ -1,338 +0,0 @@
-# Adapted from https://huggingface.co/Qwen/Qwen-7B-Chat/blob/main/tokenization_qwen.py
-import os
-from typing import Collection, List, Optional, Dict, Set, Tuple, Union
-
-from functools import cached_property
-
-import base64
-import requests
-
-from transformers import PreTrainedTokenizer, AddedToken, AutoConfig
-from transformers.models.auto.tokenization_auto import get_tokenizer_config
-import tiktoken
-
-
-"""
-This tokenizer is almost identical to tiktoken.get_encoding("cl100k_base")
-with a few additional special tokens to support the ChatML format.
-
-TODO(bapatra): Right now, I do not save the special tokens to the vocab file.
-Maybe in the future, that would be useful? Can add that support later.
-
-"""
-
-def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
-    with open(tiktoken_bpe_file, "rb") as f:
-        contents = f.read()
-    return {
-        base64.b64decode(token): int(rank)
-        for token, rank in (line.split() for line in contents.splitlines() if line)
-    }
-
-# On the megatron codebase, we pad vocabularies to ensure matrix multiplication is fast.
-# this in turn causes some indices to be empty. We account for these empty indices by adding
-# dummy tokens to the tokenizer.
-
-EFFECTIVE_PADDED_VOCAB_SIZE = 100352
-ACTUAL_VOCAB_SIZE = 100276
-
-
-DUMMY_TOKENS = {
-    f"<|dummy_id_{11 + offset}|>": 100276 + offset
-    for offset in range(1, EFFECTIVE_PADDED_VOCAB_SIZE - ACTUAL_VOCAB_SIZE)
-}
-
-SPECIAL_TOKENS = {
-    # tiktoken.get_encoding("cl100k_base")._special_tokens
-    '<|endoftext|>': 100257,
-    '<|fim_prefix|>': 100258,
-    '<|fim_middle|>': 100259,
-    '<|fim_suffix|>': 100260,
-    # Special tokens for post-training
-    "<|system|>": 100261,
-    "<|user|>": 100262,
-    "<|assistant|>": 100263,
-    # Dummy unused tokens
-    "<|dummy_id_0|>": 100264,
-    "<|dummy_id_1|>": 100265,
-    # Special tokens for post-training continued
-    "<|end|>": 100266,
-    # Some dummy tokens, so that tokenization is contiguous and does not cause issues
-    # Note that the 100256th token of tiktoken.get_encoding("cl100k_base") does not
-    # actually map to anything. So we use a dummy token here.
-    "<|dummy_id_2|>": 100256,
-    # Likewise, tokens from 100267 to 100275 are also unused
-    "<|dummy_id_3|>": 100267,
-    "<|dummy_id_4|>": 100268,
-    "<|dummy_id_5|>": 100269,
-    "<|dummy_id_6|>": 100270,
-    "<|dummy_id_7|>": 100271,
-    "<|dummy_id_8|>": 100272,
-    "<|dummy_id_9|>": 100273,
-    "<|dummy_id_10|>": 100274,
-    "<|dummy_id_11|>": 100275,
-    # The final end of prompt token
-    # (unused, but present as a part of tiktoken.get_encoding("cl100k_base")._special_tokens)
-    '<|endofprompt|>': 100276,
-    # Dummy tokens to account for padding of the tokenizer
-    # We pad to ensure tensor cores are used for vocab multiplication
-    **DUMMY_TOKENS
-}
-
-class Phi3SmallTokenizer(PreTrainedTokenizer):
-    vocab_files_names = {
-        "vocab_file": "cl100k_base.tiktoken"
-    }
-
-    model_input_names: List[str] = ["input_ids", "attention_mask"]
-    padding_side = "left"
-
-    def __init__(
-        self,
-        vocab_file: Optional[str] = None,
-        errors: str = "replace",
-        **kwargs
-    ) -> None:
-        # PreTrainedTokenizer's init calls _add_tokens, which in turn checks
-        # if the token is present in `self.special_tokens``. Hence instantiating it here.
-        # The way Qwen gets around this is by checking against SPECIAL_TOKENS
-        # But I think it's better to check against the objects own `special_tokens`
-        # in case we eventually want to allow the tokenizer to have special tokens.
-        self.special_tokens = SPECIAL_TOKENS
-
-        super().__init__(**kwargs)
-        self.errors = errors
-
-        try:
-            base = tiktoken.get_encoding("cl100k_base")
-            # This deals with the scenario where user has restricted internet access
-            # and thus fails to download the tokenizer file from https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken
-            # It is assumed that user should be able to access files on huggingface hub.
-        except requests.RequestException:
-            import hashlib
-            from transformers.utils import cached_file
-            cached_tokenizer_path = cached_file(
-                "microsoft/Phi-3-small-8k-instruct",
-                "cl100k_base.tiktoken",
-                _raise_exceptions_for_gated_repo=False,
-                _raise_exceptions_for_missing_entries=False,
-                _raise_exceptions_for_connection_errors=False
-            )
-            tiktoken_cache_dir = os.path.dirname(cached_tokenizer_path)
-            tiktoken_cache_path = os.path.join(
-                tiktoken_cache_dir,
-                hashlib.sha1("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()).hexdigest()
-            )
-            if not os.path.exists(tiktoken_cache_path):
-                os.rename(cached_tokenizer_path, tiktoken_cache_path)
-            os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
-            base = tiktoken.get_encoding("cl100k_base")
-
-        if vocab_file is None:
-            self.mergeable_ranks: Dict[bytes, int] = base._mergeable_ranks
-        else:
-            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)
-
-        self.pat_str = base._pat_str
-
-        enc = tiktoken.Encoding(
-            name="phi3small",
-            pat_str=self.pat_str,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        self.tokenizer = enc
-
-        self.decoder: Dict[int, bytes] = {
-            v: k for k, v in self.mergeable_ranks.items()
-        }
-        self.decoder.update({v: k for k, v in self.special_tokens.items()})
-
-        self.eod_id = self.tokenizer.eot_token
-        self._eos_token = self._convert_id_to_token(self.eod_id)
-
-        # Setting the bos_token to be the same as the eos_token
-        # Note that this is **not** the correct thing to do, and is done
-        # just so that some of the downstream libraries do not break.
-        self._bos_token = self._eos_token
-
-        # Assign the special tokens to class variables
-        self.system_id = self.special_tokens["<|system|>"]
-        self.user_id = self.special_tokens["<|user|>"]
-        self.assistant_id = self.special_tokens["<|assistant|>"]
-        self.end_id = self.special_tokens["<|end|>"]
-
-    @cached_property
-    def dummy_token_indices(self) -> List[int]:
-        # There are some additional special tokens in the cl100k_base tokenizer
-        # that we do not use. Hence, we also consider them to be dummy tokens.
-        additional_tokens = [
-            "<|fim_prefix|>",
-            "<|fim_middle|>",
-            "<|fim_suffix|>",
-            "<|endofprompt|>"
-        ]
-        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy_id" in token]
-        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
-        return sorted(dummy_token_indices)
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        del state["tokenizer"]
-        return state
-
-    def __setstate__(self, state):
-        self.__dict__ = state
-        enc = tiktoken.Encoding(
-            name="cl100k_im",
-            pat_str=self.pat_str,
-            mergeable_ranks=self.mergeable_ranks,
-            special_tokens=self.special_tokens,
-        )
-        self.tokenizer = enc
-
-    def __len__(self):
-        return self.tokenizer.n_vocab
-
-    @classmethod
-    def from_pretrained(
-        cls,
-        pretrained_model_name_or_path: Union[str, os.PathLike],
-        *init_inputs,
-        **kwargs,
-    ):
-        cls_kwargs = kwargs
-        # First try to load from the tokenization config if it exists
-        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
-        if tokenization_config:
-            cls_kwargs = {
-                **tokenization_config,
-                **cls_kwargs
-            }
-        else:
-            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
-            cls_kwargs["model_max_length"] = config.max_position_embeddings
-        return cls(**cls_kwargs)
-
-    def get_vocab(self) -> Dict[Union[str, bytes], int]:
-        return {**self.mergeable_ranks, **self.special_tokens}
-
-    def convert_tokens_to_ids(
-        self,
-        tokens: Union[bytes, str, List[Union[bytes, str]]]
-    ) -> Union[int, List[int]]:
-        ids = []
-        if isinstance(tokens, (str, bytes)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.mergeable_ranks.get(tokens)
-        ids: List[int] = []
-        for token in tokens:
-            ids.append(self.convert_tokens_to_ids(token))
-        return ids
-
-    def _add_tokens(
-        self,
-        new_tokens: Union[List[str], List[AddedToken]],
-        special_tokens: bool = False,
-    ) -> int:
-        if not special_tokens and new_tokens:
-            raise ValueError("Only special tokens can be added to this tokenizer")
-        for token in new_tokens:
-            surface_form = token.content if isinstance(token, AddedToken) else token
-            if surface_form not in self.special_tokens:
-                raise ValueError(
-                    "For now, we do not support unknown special tokens\n"
-                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
-                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
-                    "And finally, we can re-construct the enc object back\n"
-                )
-        return 0
-
-    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
-        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
-        with open(file_path, "w") as f:
-            for token, rank in self.mergeable_ranks.items():
-                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
-                f.write(line)
-        return (file_path,)
-
-    def tokenize(
-        self,
-        text: str,
-        allowed_special: Union[Set, str] = "all",
-        disallowed_special: Union[Collection, str] = (),
-        **kwargs
-    ) -> List[Union[bytes, str]]:
-        tokens: List[Union[bytes, str]] = []
-        for token_id in self.tokenizer.encode(
-            text, allowed_special=allowed_special, disallowed_special=disallowed_special
-        ):
-            tokens.append(self.decoder[token_id])
-        return tokens
-
-    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
-        """
-        Converts a sequence of tokens in a single string.
-        """
-        text = ""
-        temp = b""
-        for t in tokens:
-            if isinstance(t, str):
-                if temp:
-                    text += temp.decode("utf-8", errors=self.errors)
-                    temp = b""
-                text += t
-            elif isinstance(t, bytes):
-                temp += t
-            else:
-                raise TypeError("token should only be of type types or str")
-        if temp:
-            text += temp.decode("utf-8", errors=self.errors)
-        return text
-
-    @property
-    def vocab_size(self):
-        return self.tokenizer.n_vocab
-
-    @property
-    def eos_token_id(self) -> int:
-        return self.eod_id
-
-    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
-        """Converts an id to a token, special tokens included"""
-        if index in self.decoder:
-            return self.decoder[index]
-        raise ValueError("unknown ids")
-
-    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
-        """Converts a token to an id using the vocab, special tokens included"""
-        if token in self.special_tokens:
-            return self.special_tokens[token]
-        if token in self.mergeable_ranks:
-            return self.mergeable_ranks[token]
-        raise ValueError("unknown token")
-
-    def _tokenize(self, text: str, **kwargs):
-        """
-        Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based
-        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
-        Do NOT take care of added tokens.
-        """
-        raise NotImplementedError
-
-    def _decode(
-        self,
-        token_ids: Union[int, List[int]],
-        skip_special_tokens: bool = False,
-        errors: str = None,
-        **kwargs,
-    ) -> str:
-        if isinstance(token_ids, int):
-            token_ids = [token_ids]
-        if skip_special_tokens:
-            token_ids = [i for i in token_ids if i < self.eod_id]
-        return self.tokenizer.decode(token_ids, errors=errors or self.errors)
-
-
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:03694739c3ccc766544b2ecc80a899498ffebd3f34419475ef2e2995c7210fd7
-size 15524096
+oid sha256:2923f15e986925cfb5e017bc9acbe2e24add5218d2b44558e1283fe76bb6df04
+size 3620658
tokenizer_config.json CHANGED
@@ -1,98 +1,114 @@
 {
   "add_bos_token": false,
   "add_eos_token": false,
-  "add_prefix_space": false,
+  "add_prefix_space": null,
   "added_tokens_decoder": {
-    "199999": {
-      "content": "<|endoftext|>",
+    "0": {
+      "content": "<unk>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200018": {
-      "content": "<|endofprompt|>",
+    "1": {
+      "content": "<s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200019": {
-      "content": "<|assistant|>",
+    "2": {
+      "content": "</s>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": true
+      "special": false
     },
-    "200020": {
-      "content": "<|end|>",
+    "32000": {
+      "content": "<|endoftext|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "200021": {
-      "content": "<|user|>",
+    "32001": {
+      "content": "<|assistant|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
       "special": true
     },
-    "200022": {
-      "content": "<|system|>",
+    "32002": {
+      "content": "<|placeholder1|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
       "special": true
     },
-    "200023": {
-      "content": "<|tool|>",
+    "32003": {
+      "content": "<|placeholder2|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200024": {
-      "content": "<|/tool|>",
+    "32004": {
+      "content": "<|placeholder3|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200025": {
-      "content": "<|tool_call|>",
+    "32005": {
+      "content": "<|placeholder4|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200026": {
-      "content": "<|/tool_call|>",
+    "32006": {
+      "content": "<|system|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
     },
-    "200027": {
-      "content": "<|tool_response|>",
+    "32007": {
+      "content": "<|end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32008": {
+      "content": "<|placeholder5|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
       "single_word": false,
-      "special": false
+      "special": true
+    },
+    "32009": {
+      "content": "<|placeholder6|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": true,
+      "single_word": false,
+      "special": true
     },
-    "200028": {
-      "content": "<|tag|>",
+    "32010": {
+      "content": "<|user|>",
       "lstrip": false,
       "normalized": false,
       "rstrip": true,
@@ -103,15 +119,18 @@
   "additional_special_tokens": [
     "<|end|>"
   ],
-  "bos_token": "<|endoftext|>",
-  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and 'tools' in message and message['tools'] is not none %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|tool|>' + message['tools'] + '<|/tool|>' + '<|end|>' }}{% else %}{{ '<|' + message['role'] + '|>' + message['content'] + '<|end|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>' }}{% else %}{{ eos_token }}{% endif %}",
+  "bos_token": "<s>",
+  "chat_template": "{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "extra_special_tokens": {},
+  "legacy": false,
   "model_max_length": 131072,
   "pad_token": "<|endoftext|>",
   "padding_side": "right",
+  "sp_model_kwargs": {},
   "split_special_tokens": false,
-  "tokenizer_class": "GPT2Tokenizer",
-  "unk_token": "<|endoftext|>"
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
 }
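The chat template also switches to Phi-3's plain user/assistant format (no system/tool branches). A sketch of how it renders a short exchange, using the template text from the diff above:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")
messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi, how can I help?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
# Per the template: "<|user|>\nHello!<|end|>\n<|assistant|>\nHi, how can I help?<|end|>\n"
print(prompt)
```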
vocab.json DELETED
The diff for this file is too large to render. See raw diff