Spaces:
Runtime error
Runtime error
ffreemt
committed on
Commit
·
8afcc54
1
Parent(s):
d7fd091
Update
Browse files
app.py
CHANGED
|
@@ -2,8 +2,10 @@
|
|
| 2 |
# pylint: disable=invalid-name, missing-function-docstring, missing-class-docstring, redefined-outer-name, broad-except
|
| 3 |
import os
|
| 4 |
import time
|
|
|
|
| 5 |
|
| 6 |
import gradio as gr
|
|
|
|
| 7 |
|
| 8 |
# from mcli import predict
|
| 9 |
from huggingface_hub import hf_hub_download
|
|
@@ -17,6 +19,7 @@ if os.environ.get("MOSAICML_API_KEY") is None:
|
|
| 17 |
raise ValueError("git environment variable must be set")
|
| 18 |
# """
|
| 19 |
|
|
|
|
| 20 |
def predict(x, y, timeout):
|
| 21 |
logger.debug(f"{x=}, {y=}, {timeout=}")
|
| 22 |
|
|
@@ -31,6 +34,47 @@ def download_mpt_quant(destination_folder: str, repo_id: str, model_filename: st
|
|
| 31 |
)
|
| 32 |
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
class Chat:
|
| 35 |
default_system_prompt = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."
|
| 36 |
system_format = "<|im_start|>system\n{}<|im_end|>\n"
|
|
@@ -120,7 +164,21 @@ def call_inf_server(prompt):
|
|
| 120 |
# remove spl tokens from prompt
|
| 121 |
spl_tokens = ["<|im_start|>", "<|im_end|>"]
|
| 122 |
clean_prompt = prompt.replace(spl_tokens[0], "").replace(spl_tokens[1], "")
|
| 123 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
except Exception as e:
|
| 125 |
# assume it is our error
|
| 126 |
# just wait and try one more time
|
|
@@ -142,10 +200,36 @@ _ = """full url: https://huggingface.co/TheBloke/mpt-30B-chat-GGML/blob/main/mpt
|
|
| 142 |
repo_id = "TheBloke/mpt-30B-chat-GGML"
|
| 143 |
model_filename = "mpt-30b-chat.ggmlv0.q4_1.bin"
|
| 144 |
destination_folder = "models"
|
| 145 |
-
|
|
|
|
| 146 |
|
| 147 |
logger.info("done dl")
|
| 148 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
with gr.Blocks(
|
| 150 |
theme=gr.themes.Soft(),
|
| 151 |
css=".disclaimer {font-variant-caps: all-small-caps;}",
|
|
|
|
| 2 |
# pylint: disable=invalid-name, missing-function-docstring, missing-class-docstring, redefined-outer-name, broad-except
|
| 3 |
import os
|
| 4 |
import time
|
| 5 |
+
from dataclasses import asdict, dataclass
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
+
from ctransformers import AutoConfig, AutoModelForCausalLM
|
| 9 |
|
| 10 |
# from mcli import predict
|
| 11 |
from huggingface_hub import hf_hub_download
|
|
|
|
| 19 |
raise ValueError("git environment variable must be set")
|
| 20 |
# """
|
| 21 |
|
| 22 |
+
|
| 23 |
def predict(x, y, timeout):
|
| 24 |
logger.debug(f"{x=}, {y=}, {timeout=}")
|
| 25 |
|
|
|
|
| 34 |
)
|
| 35 |
|
| 36 |
|
| 37 |
+
@dataclass
class GenerationConfig:
    """Bundle of generation options handed to the model call.

    ``generate`` expands an instance with ``dataclasses.asdict`` and passes
    every field as a keyword argument to the ``llm`` callable, so the field
    names here must match the keyword names that callable accepts.
    """

    # Sampling-related options.
    temperature: float
    top_k: int
    top_p: float
    repetition_penalty: float
    max_new_tokens: int  # adjust as needed
    seed: int

    # Runtime-related options.
    reset: bool  # reset history (cache)
    stream: bool  # streaming per word/token
    threads: int  # adjust for your CPU
    stop: list[str]  # presumably sequences that end generation -- TODO confirm
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def format_prompt(system_prompt: str, user_prompt: str) -> str:
    """Assemble the single-turn prompt string sent to the model.

    Format based on:
    https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py

    Args:
        system_prompt: text placed in the ``system`` section.
        user_prompt: text placed in the ``user`` section.

    Returns:
        The system section, the user section and an opening ``assistant``
        header concatenated in that order. Each section is wrapped in
        ``<|im_start|>`` / ``<|im_end|>`` markers; the assistant header is
        left open so the model writes the reply after it.
    """
    # Use distinct local names instead of rebinding the parameters, so the
    # raw inputs stay available while the sections are built.
    system_section = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
    user_section = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
    # Plain literal: there is nothing to interpolate here.
    assistant_header = "<|im_start|>assistant\n"

    return f"{system_section}{user_section}{assistant_header}"
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def generate(
    llm: AutoModelForCausalLM,
    generation_config: GenerationConfig,
    system_prompt: str,
    user_prompt: str,
):
    """Run model inference; returns a Generator when streaming is enabled.

    The system and user prompts are combined by ``format_prompt`` and every
    field of ``generation_config`` is forwarded to ``llm`` as a keyword
    argument.
    """
    full_prompt = format_prompt(system_prompt, user_prompt)
    call_options = asdict(generation_config)
    return llm(full_prompt, **call_options)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
class Chat:
|
| 79 |
default_system_prompt = "A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers."
|
| 80 |
system_format = "<|im_start|>system\n{}<|im_end|>\n"
|
|
|
|
| 164 |
# remove spl tokens from prompt
|
| 165 |
spl_tokens = ["<|im_start|>", "<|im_end|>"]
|
| 166 |
clean_prompt = prompt.replace(spl_tokens[0], "").replace(spl_tokens[1], "")
|
| 167 |
+
|
| 168 |
+
# return response[len(clean_prompt) :] # remove the prompt
|
| 169 |
+
try:
|
| 170 |
+
user_prompt = prompt
|
| 171 |
+
generator = generate(llm, generation_config, system_prompt, user_prompt.strip())
|
| 172 |
+
print(assistant_prefix, end=" ", flush=True)
|
| 173 |
+
for word in generator:
|
| 174 |
+
print(word, end="", flush=True)
|
| 175 |
+
print("")
|
| 176 |
+
response = word
|
| 177 |
+
except Exception as exc:
|
| 178 |
+
logger.error(exc)
|
| 179 |
+
response = f"{exc=}"
|
| 180 |
+
return response
|
| 181 |
+
|
| 182 |
except Exception as e:
|
| 183 |
# assume it is our error
|
| 184 |
# just wait and try one more time
|
|
|
|
| 200 |
# Location of the quantized model weights on the Hugging Face Hub and the
# local folder they are expected to live in.
repo_id = "TheBloke/mpt-30B-chat-GGML"
model_filename = "mpt-30b-chat.ggmlv0.q4_1.bin"
destination_folder = "models"

# NOTE(review): the download step is disabled, so the model file must already
# exist under ``destination_folder`` for the load below to succeed.
# download_mpt_quant(destination_folder, repo_id, model_filename)

logger.info("done dl")

config = AutoConfig.from_pretrained("mosaicml/mpt-30b-chat", context_length=8192)
llm = AutoModelForCausalLM.from_pretrained(
    # Build the path from the constants above rather than repeating the
    # literal, so the folder and file name are defined in one place only.
    os.path.abspath(os.path.join(destination_folder, model_filename)),
    model_type="mpt",
    config=config,
)

system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."

generation_config = GenerationConfig(
    temperature=0.2,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
    max_new_tokens=512,  # adjust as needed
    seed=42,
    reset=False,  # reset history (cache)
    stream=True,  # streaming per word/token
    # Half of the available CPUs (adjust for your CPU). ``os.cpu_count()``
    # may return None, and halving a single CPU would give 0, so fall back
    # to one CPU and never request fewer than one thread.
    threads=max(1, (os.cpu_count() or 1) // 2),
    stop=["<|im_end|>", "|<"],
)

user_prefix = "[user]: "
assistant_prefix = "[assistant]:"
|
| 232 |
+
|
| 233 |
with gr.Blocks(
|
| 234 |
theme=gr.themes.Soft(),
|
| 235 |
css=".disclaimer {font-variant-caps: all-small-caps;}",
|