Spaces:
Running
Running
tokutsu
committed on
Commit
·
a0f7b9d
1
Parent(s):
8b3b702
Update README & Add patch, script
Browse files- README.md +44 -5
- apply_patch.sh +28 -0
- hf.py.patch +56 -0
- index.html +0 -19
- style.css +0 -28
README.md
CHANGED
|
@@ -1,12 +1,51 @@
|
|
| 1 |
---
|
| 2 |
title: Exllamav2 Patch
|
| 3 |
-
emoji: 🌖
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: indigo
|
| 6 |
sdk: static
|
| 7 |
-
pinned: false
|
| 8 |
license: mit
|
| 9 |
short_description: Patch for ExLlamaV2 to support Unigram-based tokenizers
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
title: Exllamav2 Patch
|
|
|
|
|
|
|
|
|
|
| 3 |
sdk: static
|
|
|
|
| 4 |
license: mit
|
| 5 |
short_description: Patch for ExLlamaV2 to support Unigram-based tokenizers
|
| 6 |
---
|
| 7 |
|
| 8 |
+
# Unigram Tokenizer Patch for ExLlamaV2
|
| 9 |
+
|
| 10 |
+
This repository provides a patch to support **Unigram-based tokenizers** with `ExLlamaV2`.
|
| 11 |
+
By default, using a Unigram tokenizer may result in an error during inference. This patch resolves that issue.
|
| 12 |
+
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
## Files
|
| 16 |
+
|
| 17 |
+
- `hf.py.patch`
|
| 18 |
+
A patch file that modifies `hf.py` in the ExLlamaV2 repository to support Unigram tokenizers.
|
| 19 |
+
|
| 20 |
+
- `apply_patch.sh`
|
| 21 |
+
A shell script to apply the patch.
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
## Usage
|
| 26 |
+
|
| 27 |
+
You can apply the patch with the following command:
|
| 28 |
+
|
| 29 |
+
```bash
|
| 30 |
+
$ ./apply_patch.sh EXLLAMAV2_DIR
|
| 31 |
+
```
|
| 32 |
+
|
| 33 |
+
Replace EXLLAMAV2_DIR with the path to your local clone of ExLlamaV2.
|
| 34 |
+
|
| 35 |
+
Example:
|
| 36 |
+
```bash
|
| 37 |
+
$ ./apply_patch.sh ~/repos/exllamav2
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
## Purpose
|
| 41 |
+
|
| 42 |
+
The patch is intended to:
|
| 43 |
+
- Prevent runtime errors when using tokenizers.models.Unigram.
|
| 44 |
+
- Add fallback handling for missing unk_token attributes.
|
| 45 |
+
- Improve compatibility with models trained using SentencePiece + Unigram tokenizers.
|
| 46 |
+
|
| 47 |
+
## Notes
|
| 48 |
+
|
| 49 |
+
- This patch is intended to be applied to the hf.py file in the ExLlamaV2 repository.
|
| 50 |
+
- Tested Versions: **v0.2.6** through **v0.2.8**
|
| 51 |
+
- If the patch fails, please ensure your copy of hf.py matches the original before applying.
|
apply_patch.sh
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
THIS_DIR="$(cd $(dirname ${BASH_SOURCE}); pwd)"
|
| 4 |
+
EXLLAMAV2_DIR="$1"
|
| 5 |
+
PATCH_FILE="${THIS_DIR}/hf.py.patch"
|
| 6 |
+
|
| 7 |
+
main() {
|
| 8 |
+
if [[ -z "${EXLLAMAV2_DIR}" ]]; then
|
| 9 |
+
echo "[Usage] $0 EXLLAMAV2_DIR"
|
| 10 |
+
exit 1
|
| 11 |
+
fi
|
| 12 |
+
|
| 13 |
+
if [[ ! -d "${EXLLAMAV2_DIR}" ]]; then
|
| 14 |
+
echo "[ERROR] EXLLAMAV2_DIR does not exist. (${EXLLAMAV2_DIR})" >&2
|
| 15 |
+
exit 1
|
| 16 |
+
fi
|
| 17 |
+
|
| 18 |
+
if [[ ! -e "${PATCH_FILE}" ]]; then
|
| 19 |
+
echo "[ERROR] Patch file not found. (${PATCH_FILE})" >&2
|
| 20 |
+
exit 1
|
| 21 |
+
fi
|
| 22 |
+
|
| 23 |
+
if ! (cd "${EXLLAMAV2_DIR}" && patch -p1 < "${PATCH_FILE}"); then
|
| 24 |
+
echo "[ERROR] Failed to apply patch (${PATCH_FILE})" >&2
|
| 25 |
+
exit 1
|
| 26 |
+
fi
|
| 27 |
+
}
|
| 28 |
+
main "$@"
|
hf.py.patch
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
diff --git a/exllamav2/tokenizer/hf.py b/exllamav2/tokenizer/hf.py
|
| 2 |
+
index 56134d0..9fde261 100644
|
| 3 |
+
--- a/exllamav2/tokenizer/hf.py
|
| 4 |
+
+++ b/exllamav2/tokenizer/hf.py
|
| 5 |
+
@@ -1,4 +1,5 @@
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
+import json
|
| 8 |
+
from typing import List, Union
|
| 9 |
+
from exllamav2.tokenizer.base import ExLlamaV2TokenizerBase
|
| 10 |
+
from tokenizers import Tokenizer
|
| 11 |
+
@@ -10,6 +11,7 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
|
| 12 |
+
|
| 13 |
+
space_char_: str
|
| 14 |
+
newline_char_: str
|
| 15 |
+
+ unk_token_: str | None
|
| 16 |
+
vocab: list[str] | None
|
| 17 |
+
|
| 18 |
+
def __init__(self, tokenizer_json: str) -> None:
|
| 19 |
+
@@ -18,6 +20,7 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
|
| 20 |
+
self.vocab = None
|
| 21 |
+
self.space_char_ = " "
|
| 22 |
+
self.newline_char_ = "\n"
|
| 23 |
+
+ self.unk_token_ = None
|
| 24 |
+
|
| 25 |
+
self.hf_tokenizer = Tokenizer.from_file(tokenizer_json)
|
| 26 |
+
|
| 27 |
+
@@ -26,11 +29,18 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
|
| 28 |
+
self.space_char_ = self.deduce_char_map(" ") # "Ġ"
|
| 29 |
+
self.newline_char_ = self.deduce_char_map("\n") # "Ċ"
|
| 30 |
+
|
| 31 |
+
+ if isinstance(m, models.Unigram):
|
| 32 |
+
+ unk_id = self._get_unk_id_from_tokenizer_json(tokenizer_json)
|
| 33 |
+
+ self.unk_token_ = self.id_to_piece(unk_id)
|
| 34 |
+
+ else:
|
| 35 |
+
+ self.unk_token_ = getattr(m, "unk_token", None)
|
| 36 |
+
+
|
| 37 |
+
def unk_id(self) -> int or None: return None if self.unk_token() is None else self.piece_to_id(self.unk_token())
|
| 38 |
+
def pad_id(self) -> int or None: return None
|
| 39 |
+
def bos_id(self) -> int or None: return None
|
| 40 |
+
def eos_id(self) -> int or None: return None
|
| 41 |
+
- def unk_token(self) -> str or None: return self.hf_tokenizer.model.unk_token
|
| 42 |
+
+ # def unk_token(self) -> str or None: return self.hf_tokenizer.model.unk_token
|
| 43 |
+
+ def unk_token(self) -> str or None: return self.unk_token_
|
| 44 |
+
def pad_token(self) -> str or None: return None
|
| 45 |
+
def bos_token(self) -> str or None: return None
|
| 46 |
+
def eos_token(self) -> str or None: return None
|
| 47 |
+
@@ -84,3 +94,9 @@ class ExLlamaV2TokenizerHF(ExLlamaV2TokenizerBase):
|
| 48 |
+
def encode(self, text: list or str) -> list:
|
| 49 |
+
encoding = self.hf_tokenizer.encode(text, add_special_tokens = False)
|
| 50 |
+
return encoding.ids
|
| 51 |
+
+
|
| 52 |
+
+ @staticmethod
|
| 53 |
+
+ def _get_unk_id_from_tokenizer_json(tokenizer_json: str) -> str | None:
|
| 54 |
+
+ with open(tokenizer_json, "r", encoding="utf-8") as f:
|
| 55 |
+
+ tokenizer_json = json.load(f)
|
| 56 |
+
+ return tokenizer_json.get("model", {}).get("unk_id", None)
|
index.html
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
<!doctype html>
|
| 2 |
-
<html>
|
| 3 |
-
<head>
|
| 4 |
-
<meta charset="utf-8" />
|
| 5 |
-
<meta name="viewport" content="width=device-width" />
|
| 6 |
-
<title>My static Space</title>
|
| 7 |
-
<link rel="stylesheet" href="style.css" />
|
| 8 |
-
</head>
|
| 9 |
-
<body>
|
| 10 |
-
<div class="card">
|
| 11 |
-
<h1>Welcome to your static Space!</h1>
|
| 12 |
-
<p>You can modify this app directly by editing <i>index.html</i> in the Files and versions tab.</p>
|
| 13 |
-
<p>
|
| 14 |
-
Also don't forget to check the
|
| 15 |
-
<a href="https://huggingface.co/docs/hub/spaces" target="_blank">Spaces documentation</a>.
|
| 16 |
-
</p>
|
| 17 |
-
</div>
|
| 18 |
-
</body>
|
| 19 |
-
</html>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
style.css
DELETED
|
@@ -1,28 +0,0 @@
|
|
| 1 |
-
body {
|
| 2 |
-
padding: 2rem;
|
| 3 |
-
font-family: -apple-system, BlinkMacSystemFont, "Arial", sans-serif;
|
| 4 |
-
}
|
| 5 |
-
|
| 6 |
-
h1 {
|
| 7 |
-
font-size: 16px;
|
| 8 |
-
margin-top: 0;
|
| 9 |
-
}
|
| 10 |
-
|
| 11 |
-
p {
|
| 12 |
-
color: rgb(107, 114, 128);
|
| 13 |
-
font-size: 15px;
|
| 14 |
-
margin-bottom: 10px;
|
| 15 |
-
margin-top: 5px;
|
| 16 |
-
}
|
| 17 |
-
|
| 18 |
-
.card {
|
| 19 |
-
max-width: 620px;
|
| 20 |
-
margin: 0 auto;
|
| 21 |
-
padding: 16px;
|
| 22 |
-
border: 1px solid lightgray;
|
| 23 |
-
border-radius: 16px;
|
| 24 |
-
}
|
| 25 |
-
|
| 26 |
-
.card p:last-child {
|
| 27 |
-
margin-bottom: 0;
|
| 28 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|