tokutsu committed on
Commit
a0f7b9d
·
1 Parent(s): 8b3b702

Update README & Add patch, script

Browse files
Files changed (5) hide show
  1. README.md +44 -5
  2. apply_patch.sh +28 -0
  3. hf.py.patch +56 -0
  4. index.html +0 -19
  5. style.css +0 -28
README.md CHANGED
@@ -1,12 +1,51 @@
1
  ---
2
  title: Exllamav2 Patch
3
- emoji: 🌖
4
- colorFrom: red
5
- colorTo: indigo
6
  sdk: static
7
- pinned: false
8
  license: mit
9
  short_description: Patch for ExLlamaV2 to support Unigram-based tokenizers
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: Exllamav2 Patch
 
 
 
3
  sdk: static
 
4
  license: mit
5
  short_description: Patch for ExLlamaV2 to support Unigram-based tokenizers
6
  ---
7
 
8
+ # Unigram Tokenizer Patch for ExLlamaV2
9
+
10
+ This repository provides a patch to support **Unigram-based tokenizers** with `ExLlamaV2`.
11
+ By default, using a Unigram tokenizer may result in an error during inference. This patch resolves that issue.
12
+
13
+ ---
14
+
15
+ ## Files
16
+
17
+ - `hf.py.patch`
18
+ A patch file that modifies `hf.py` in the ExLlamaV2 repository to support Unigram tokenizers.
19
+
20
+ - `apply_patch.sh`
21
+ A shell script to apply the patch.
22
+
23
+ ---
24
+
25
+ ## Usage
26
+
27
+ You can apply the patch with the following command:
28
+
29
+ ```bash
30
+ $ ./apply_patch.sh EXLLAMAV2_DIR
31
+ ```
32
+
33
 + Replace `EXLLAMAV2_DIR` with the path to your local clone of ExLlamaV2.
34
+
35
+ Example:
36
+ ```bash
37
+ $ ./apply_patch.sh ~/repos/exllamav2
38
+ ```
39
+
40
+ ## Purpose
41
+
42
+ The patch is intended to:
43
+ - Prevent runtime errors when using tokenizers.models.Unigram.
44
+ - Add fallback handling for missing unk_token attributes.
45
+ - Improve compatibility with models trained using SentencePiece + Unigram tokenizers.
46
+
47
+ ## Notes
48
+
49
+ - This patch is intended to be applied to the hf.py file in the ExLlamaV2 repository.
50
+ - Tested Versions: **v0.2.6** through **v0.2.8**
51
+ - If the patch fails, please ensure your copy of hf.py matches the original before applying.
apply_patch.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
 + THIS_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)"
4
+ EXLLAMAV2_DIR="$1"
5
+ PATCH_FILE="${THIS_DIR}/hf.py.patch"
6
+
7
+ main() {
8
+ if [[ -z "${EXLLAMAV2_DIR}" ]]; then
9
+ echo "[Usage] $0 EXLLAMAV2_DIR"
10
+ exit 1
11
+ fi
12
+
13
+ if [[ ! -d "${EXLLAMAV2_DIR}" ]]; then
14
+ echo "[ERROR] EXLLAMAV2_DIR does not exist. (${EXLLAMAV2_DIR})" >&2
15
+ exit 1
16
+ fi
17
+
18
+ if [[ ! -e "${PATCH_FILE}" ]]; then
19
+ echo "[ERROR] Patch file not found. (${PATCH_FILE})" >&2
20
+ exit 1
21
+ fi
22
+
23
+ if ! (cd "${EXLLAMAV2_DIR}" && patch -p1 < "${PATCH_FILE}"); then
24
+ echo "[ERROR] Failed to apply patch (${PATCH_FILE})" >&2
25
+ exit 1
26
+ fi
27
+ }
28
+ main "$@"
hf.py.patch ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ diff --git a/exllamav2/tokenizer/hf.py b/exllamav2/tokenizer/hf.py
2
+ index 56134d0..9fde261 100644
3
+ --- a/exllamav2/tokenizer/hf.py
4
+ +++ b/exllamav2/tokenizer/hf.py
5
+ @@ -1,4 +1,5 @@
6
+ from __future__ import annotations
7
+ +import json
8
+ from typing import List, Union
9
+ from exllamav2.tokenizer.base import ExLlamaV2TokenizerBase
10
+ from tokenizers import Tokenizer
11
+ @@ -10,6 +11,7 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
12
+
13
+ space_char_: str
14
+ newline_char_: str
15
+ + unk_token_: str | None
16
+ vocab: list[str] | None
17
+
18
+ def __init__(self, tokenizer_json: str) -> None:
19
+ @@ -18,6 +20,7 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
20
+ self.vocab = None
21
+ self.space_char_ = " "
22
+ self.newline_char_ = "\n"
23
+ + self.unk_token_ = None
24
+
25
+ self.hf_tokenizer = Tokenizer.from_file(tokenizer_json)
26
+
27
+ @@ -26,11 +29,18 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
28
+ self.space_char_ = self.deduce_char_map(" ") # "Ġ"
29
+ self.newline_char_ = self.deduce_char_map("\n") # "Ċ"
30
+
31
+ + if isinstance(m, models.Unigram):
32
+ + unk_id = self._get_unk_id_from_tokenizer_json(tokenizer_json)
33
+ + self.unk_token_ = self.id_to_piece(unk_id)
34
+ + else:
35
+ + self.unk_token_ = getattr(m, "unk_token", None)
36
+ +
37
+ def unk_id(self) -> int or None: return None if self.unk_token() is None else self.piece_to_id(self.unk_token())
38
+ def pad_id(self) -> int or None: return None
39
+ def bos_id(self) -> int or None: return None
40
+ def eos_id(self) -> int or None: return None
41
+ - def unk_token(self) -> str or None: return self.hf_tokenizer.model.unk_token
42
+ + # def unk_token(self) -> str or None: return self.hf_tokenizer.model.unk_token
43
+ + def unk_token(self) -> str or None: return self.unk_token_
44
+ def pad_token(self) -> str or None: return None
45
+ def bos_token(self) -> str or None: return None
46
+ def eos_token(self) -> str or None: return None
47
+ @@ -84,3 +94,9 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
48
+ def encode(self, text: list or str) -> list:
49
+ encoding = self.hf_tokenizer.encode(text, add_special_tokens = False)
50
+ return encoding.ids
51
+ +
52
+ + @staticmethod
53
 + + def _get_unk_id_from_tokenizer_json(tokenizer_json: str) -> int | None:
54
+ + with open(tokenizer_json, "r", encoding="utf-8") as f:
55
+ + tokenizer_json = json.load(f)
56
+ + return tokenizer_json.get("model", {}).get("unk_id", None)
index.html DELETED
@@ -1,19 +0,0 @@
1
- <!doctype html>
2
- <html>
3
- <head>
4
- <meta charset="utf-8" />
5
- <meta name="viewport" content="width=device-width" />
6
- <title>My static Space</title>
7
- <link rel="stylesheet" href="style.css" />
8
- </head>
9
- <body>
10
- <div class="card">
11
- <h1>Welcome to your static Space!</h1>
12
- <p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
13
- <p>
14
- Also don't forget to check the
15
- <a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
16
- </p>
17
- </div>
18
- </body>
19
- </html>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
style.css DELETED
@@ -1,28 +0,0 @@
1
- body {
2
- padding: 2rem;
3
- font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
4
- }
5
-
6
- h1 {
7
- font-size: 16px;
8
- margin-top: 0;
9
- }
10
-
11
- p {
12
- color: rgb(107, 114, 128);
13
- font-size: 15px;
14
- margin-bottom: 10px;
15
- margin-top: 5px;
16
- }
17
-
18
- .card {
19
- max-width: 620px;
20
- margin: 0 auto;
21
- padding: 16px;
22
- border: 1px solid lightgray;
23
- border-radius: 16px;
24
- }
25
-
26
- .card p:last-child {
27
- margin-bottom: 0;
28
- }