Commit · 85b6613
1 Parent(s): 5cdf399

CPU Flan inference is crashing, so trying to revert to previous package versions that worked

Files changed:
- app.py +4 -4
- chatfuncs/chatfuncs.py +11 -175
- requirements.txt +5 -6
app.py
CHANGED
@@ -113,14 +113,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
     if torch_device == "cuda":
         if "flan" in model_name:
-            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
+            model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")#, torch_dtype=torch.float16)
     else:
         if "flan" in model_name:
-            model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
+            model = AutoModelForSeq2SeqLM.from_pretrained(model_name)#, torch_dtype=torch.float16)
         else:
-            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)
+            model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)#, torch_dtype=torch.float16)
 
     tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
 
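
Note: this hunk drops the active torch_dtype=torch.float16 argument from every from_pretrained call (leaving it behind as a trailing comment), so models now load in the default float32 on CPU as well as GPU. As a hedged sketch only, not part of this commit, an alternative would be to pick the dtype per device, since many float16 kernels are unavailable on CPU and can crash Flan (seq2seq) inference; the checkpoint name below is a placeholder:

    # Sketch, not part of this commit: choose dtype by device.
    import torch
    from transformers import AutoModelForSeq2SeqLM

    model_name = "declare-lab/flan-alpaca-large"  # placeholder checkpoint
    dtype = torch.float16 if torch.cuda.is_available() else torch.float32
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=dtype)
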
chatfuncs/chatfuncs.py
CHANGED
@@ -99,66 +99,17 @@ context_length:int = 2048
 sample = True
 
 
-# class CtransInitConfig_gpu:
-#     def __init__(self, temperature=temperature,
-#                  top_k=top_k,
-#                  top_p=top_p,
-#                  repetition_penalty=repetition_penalty,
-#                  last_n_tokens=last_n_tokens,
-#                  max_new_tokens=max_new_tokens,
-#                  seed=seed,
-#                  reset=reset,
-#                  stream=stream,
-#                  threads=threads,
-#                  batch_size=batch_size,
-#                  context_length=context_length,
-#                  gpu_layers=gpu_layers):
-#         self.temperature = temperature
-#         self.top_k = top_k
-#         self.top_p = top_p
-#         self.repetition_penalty = repetition_penalty# repetition_penalty
-#         self.last_n_tokens = last_n_tokens
-#         self.max_new_tokens = max_new_tokens
-#         self.seed = seed
-#         self.reset = reset
-#         self.stream = stream
-#         self.threads = threads
-#         self.batch_size = batch_size
-#         self.context_length = context_length
-#         self.gpu_layers = gpu_layers
-#         # self.stop: list[str] = field(default_factory=lambda: [stop_string])
-
-#     def update_gpu(self, new_value):
-#         self.gpu_layers = new_value
-
-# class CtransInitConfig_cpu(CtransInitConfig_gpu):
-#     def __init__(self):
-#         super().__init__()
-#         self.gpu_layers = 0
-
 class CtransInitConfig_gpu:
-    def __init__(self,
-                 #top_k=top_k,
-                 #top_p=top_p,
-                 #repetition_penalty=repetition_penalty,
+    def __init__(self,
                  last_n_tokens=last_n_tokens,
-                 #max_new_tokens=max_new_tokens,
                  seed=seed,
-                 #reset=reset,
-                 #stream=stream,
                  n_threads=threads,
                  n_batch=batch_size,
                  n_ctx=4096,
                  n_gpu_layers=gpu_layers):
-
-        #self.top_k = top_k
-        #self.top_p = top_p
-        #self.repetition_penalty = repetition_penalty# repetition_penalty
+
         self.last_n_tokens = last_n_tokens
-        #self.max_new_tokens = max_new_tokens
         self.seed = seed
-        #self.reset = reset
-        #self.stream = stream
         self.n_threads = n_threads
         self.n_batch = n_batch
         self.n_ctx = n_ctx
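
Note: after this cleanup, CtransInitConfig_gpu keeps only the fields it actually uses, most of which are named after llama-cpp-python's Llama constructor keywords (seed, n_threads, n_batch, n_ctx, n_gpu_layers). A hedged sketch of how such a config could feed the loader; the model path is a placeholder and the real wiring lives in app.py, outside this hunk:

    # Sketch: pass the init config into llama-cpp-python's loader.
    from llama_cpp import Llama
    from chatfuncs.chatfuncs import CtransInitConfig_gpu

    cfg = CtransInitConfig_gpu()
    llm = Llama(model_path="model.gguf",   # placeholder path (assumption)
                seed=cfg.seed,
                n_threads=cfg.n_threads,
                n_batch=cfg.n_batch,
                n_ctx=cfg.n_ctx,
                n_gpu_layers=cfg.n_gpu_layers)
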
@@ -177,51 +128,22 @@ gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()
 
 
-# class CtransGenGenerationConfig:
-#     def __init__(self, temperature=temperature,
-#                  top_k=top_k,
-#                  top_p=top_p,
-#                  repetition_penalty=repetition_penalty,
-#                  last_n_tokens=last_n_tokens,
-#                  seed=seed,
-#                  threads=threads,
-#                  batch_size=batch_size,
-#                  reset=True
-#                  ):
-#         self.temperature = temperature
-#         self.top_k = top_k
-#         self.top_p = top_p
-#         self.repetition_penalty = repetition_penalty# repetition_penalty
-#         self.last_n_tokens = last_n_tokens
-#         self.seed = seed
-#         self.threads = threads
-#         self.batch_size = batch_size
-#         self.reset = reset
-
 class CtransGenGenerationConfig:
     def __init__(self, temperature=temperature,
                  top_k=top_k,
                  top_p=top_p,
                  repeat_penalty=repetition_penalty,
-                 #last_n_tokens=last_n_tokens,
                  seed=seed,
                  stream=stream,
                  max_tokens=max_new_tokens
-                 #threads=threads,
-                 #batch_size=batch_size,
-                 #reset=True
                  ):
         self.temperature = temperature
         self.top_k = top_k
         self.top_p = top_p
         self.repeat_penalty = repeat_penalty
-        #self.last_n_tokens = last_n_tokens
         self.seed = seed
         self.max_tokens=max_tokens
         self.stream = stream
-        #self.threads = threads
-        #self.batch_size = batch_size
-        #self.reset = reset
 
     def update_temp(self, new_value):
         self.temperature = new_value
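
Note: the surviving generation fields (temperature, top_k, top_p, repeat_penalty, seed, stream, max_tokens) line up with llama_cpp.Llama.create_completion keyword arguments, which is presumably why vars(gen_config) can be splatted straight into a completion call. A hedged usage sketch under that assumption; the model path and prompt are placeholders:

    # Sketch: stream a completion using the config above.
    from llama_cpp import Llama
    from chatfuncs.chatfuncs import CtransGenGenerationConfig

    llm = Llama(model_path="model.gguf")   # placeholder path (assumption)
    gen_config = CtransGenGenerationConfig()
    gen_config.update_temp(0.1)
    for chunk in llm("### Instruction: Hello\n### Response:", **vars(gen_config)):
        print(chunk["choices"][0]["text"], end="", flush=True)
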
@@ -417,93 +339,6 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
     return history, docs_content_string, instruction_prompt_out
 
 # Chat functions
-# def produce_streaming_answer_chatbot(history, full_prompt, model_type,
-#                                      temperature=temperature,
-#                                      max_new_tokens=max_new_tokens,
-#                                      sample=sample,
-#                                      repetition_penalty=repetition_penalty,
-#                                      top_p=top_p,
-#                                      top_k=top_k
-#                                      ):
-#     #print("Model type is: ", model_type)
-
-#     #if not full_prompt.strip():
-#     #    if history is None:
-#     #        history = []
-
-#     #    return history
-
-#     if model_type == "Flan Alpaca (small, fast)":
-#         # Get the model and tokenizer, and tokenize the user text.
-#         model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
-
-#         # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
-#         # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
-#         streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
-#         generate_kwargs = dict(
-#             model_inputs,
-#             streamer=streamer,
-#             max_new_tokens=max_new_tokens,
-#             do_sample=sample,
-#             repetition_penalty=repetition_penalty,
-#             top_p=top_p,
-#             temperature=temperature,
-#             top_k=top_k
-#         )
-
-#         print(generate_kwargs)
-
-#         t = Thread(target=model.generate, kwargs=generate_kwargs)
-#         t.start()
-
-#         # Pull the generated text from the streamer, and update the model output.
-#         start = time.time()
-#         NUM_TOKENS=0
-#         print('-'*4+'Start Generation'+'-'*4)
-
-#         history[-1][1] = ""
-#         for new_text in streamer:
-#             if new_text == None: new_text = ""
-#             history[-1][1] += new_text
-#             NUM_TOKENS+=1
-#             yield history
-
-#         time_generate = time.time() - start
-#         print('\n')
-#         print('-'*4+'End Generation'+'-'*4)
-#         print(f'Num of generated tokens: {NUM_TOKENS}')
-#         print(f'Time for complete generation: {time_generate}s')
-#         print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-#         print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
-#     elif model_type == "Mistral Open Orca (larger, slow)":
-#         tokens = model.tokenize(full_prompt)
-
-#         gen_config = CtransGenGenerationConfig()
-#         gen_config.update_temp(temperature)
-
-#         print(vars(gen_config))
-
-#         # Pull the generated text from the streamer, and update the model output.
-#         start = time.time()
-#         NUM_TOKENS=0
-#         print('-'*4+'Start Generation'+'-'*4)
-
-#         history[-1][1] = ""
-#         for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
-#             if new_text == None: new_text = ""
-#             history[-1][1] += model.detokenize(new_text) #new_text
-#             NUM_TOKENS+=1
-#             yield history
-
-#         time_generate = time.time() - start
-#         print('\n')
-#         print('-'*4+'End Generation'+'-'*4)
-#         print(f'Num of generated tokens: {NUM_TOKENS}')
-#         print(f'Time for complete generation: {time_generate}s')
-#         print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
-#         print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
-
 
 def produce_streaming_answer_chatbot(history, full_prompt, model_type,
                                      temperature=temperature,
@@ -523,8 +358,8 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 
     if model_type == "Flan Alpaca (small, fast)":
         # Get the model and tokenizer, and tokenize the user text.
-        model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device)
-
+        model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device)
+
         # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
         # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
        streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
@@ -551,10 +386,13 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 
         history[-1][1] = ""
         for new_text in streamer:
-            if new_text == None: new_text = ""
-            history[-1][1] += new_text
-            NUM_TOKENS+=1
-            yield history
+            try:
+                if new_text == None: new_text = ""
+                history[-1][1] += new_text
+                NUM_TOKENS+=1
+                yield history
+            except Exception as e:
+                print(f"Error during text generation: {e}")
 
         time_generate = time.time() - start
         print('\n')
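
Note: the new try/except keeps a single bad chunk from killing the whole generator; the loop now tolerates None chunks and logs streamer errors instead of raising through the Gradio callback. A self-contained hedged sketch of the same threaded-streaming pattern this hunk hardens; model and tokenizer are assumed to be an already-loaded Hugging Face pair:

    # Sketch of the pattern above; model/tokenizer are placeholders.
    from threading import Thread
    from transformers import TextIteratorStreamer

    streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
    inputs = tokenizer("Hello", return_tensors="pt")
    Thread(target=model.generate, kwargs=dict(**inputs, streamer=streamer, max_new_tokens=64)).start()

    text = ""
    for new_text in streamer:
        try:
            text += new_text or ""  # guard against None chunks, as the commit does
        except Exception as e:
            print(f"Error during text generation: {e}")
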
@@ -567,8 +405,6 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
     elif model_type == "Mistral Open Orca (larger, slow)":
         #tokens = model.tokenize(full_prompt)
 
-        temp = ""
-
         gen_config = CtransGenGenerationConfig()
         gen_config.update_temp(temperature)
 
requirements.txt
CHANGED
@@ -2,15 +2,14 @@ langchain
 langchain-community
 beautifulsoup4
 pandas
-transformers
+transformers==4.34.0
 llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
-
-
-sentence_transformers
-faiss-cpu
+torch \
+    --extra-index-url https://download.pytorch.org/whl/cu121
+sentence_transformers==2.2.2
+faiss-cpu==1.7.4
 pypdf
 python-docx
-#ctransformers[cuda]
 keybert
 span_marker
 gensim
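
Note: the pins (transformers==4.34.0, sentence_transformers==2.2.2, faiss-cpu==1.7.4, plus a CUDA 12.1 torch wheel) revert to the combination that previously worked for CPU Flan inference. A hedged startup sanity check, assuming these are the versions the Space should actually import:

    # Sketch: confirm the reverted pins are what gets imported at runtime.
    import sentence_transformers, transformers

    print("transformers", transformers.__version__)                    # expect 4.34.0
    print("sentence-transformers", sentence_transformers.__version__)  # expect 2.2.2
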