from copy import deepcopy
from functools import partial

import gradio as gr
import spaces  # Hugging Face Spaces SDK; kept for the Spaces runtime
import torch
from transformers import pipeline
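
# Gradio demo: given the partial text a user is typing into another chat
# interface, the model proposes short auto-completions (see the system
# prompt below).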

# ChatML prompt templates (OpenHermes-2.5 is trained on ChatML).
prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
{prompt}<|im_end|>
<|im_start|>assistant
'''

system_only_prompt_format = '''<|im_start|>system
{system_message}<|im_end|>
<|im_start|>user
'''

system_prompt = '''You are given a partial input text for another AI chat interface.
Propose auto-completions for the text. You have several roles:
- Fight under-specification.
- Complete the text to save the user time.
Don't suggest anything if there are no good suggestions.
Make sure the suggestions are valid completions of the text! Suggest only up to 5 words ahead. The scheme of your answer should be "answer1;answer2;answer3" (return between 0 and 4 answers).
Answers should be only the completions themselves. If you have nothing as a completion, return "<NOTHING>".
Examples:
(1)
User: Help me write a sentiment analysis pipeline
Assistant: using huggingface;using NLTK;using python
(2)
User: My name is
Assistant: <NOTHING> (nothing useful to contribute at this point, so return nothing)
(3)
User: Help me find a present for my
Assistant: girlfriend;mother;father;friend
'''
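
# Illustrative sketch (not part of the original app): the system prompt above
# asks the model for "answer1;answer2;answer3" or the sentinel "<NOTHING>".
# A hypothetical helper like this could turn the raw completion into a list
# of suggestions; the name parse_suggestions is an assumption, not the app's API.
def parse_suggestions(raw):
    # Empty output or the <NOTHING> sentinel means "no suggestions".
    raw = raw.strip()
    if not raw or raw.startswith('<NOTHING>'):
        return []
    # Split on ';' and drop empty pieces and surrounding whitespace.
    return [part.strip() for part in raw.split(';') if part.strip()]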

# setup
torch.set_grad_enabled(False)
model_name = "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
pipe = pipeline("text-generation", model=model_name, device='cuda')
generate_kwargs = {
    'max_new_tokens': 20,
    'do_sample': True,  # temperature/repetition_penalty only apply when sampling
    'temperature': 0.8,
    'repetition_penalty': 1.1,
}

def past_kv_to_device(past_kv, device, dtype):
    # Rebuild tensors from the nested lists produced by detach_past_kv and
    # move them onto the model's device/dtype.
    return tuple(
        (torch.tensor(k).to(device, dtype), torch.tensor(v).to(device, dtype))
        for k, v in past_kv
    )

def detach_past_kv(past_kv):
    # Detach the cache into plain nested Python lists so it can be stored and
    # serialized off-GPU (at the cost of a slow round-trip).
    return tuple(
        (k.cpu().detach().numpy().tolist(), v.cpu().detach().numpy().tolist())
        for k, v in past_kv
    )

def set_past_key_values():
    # Precompute the key/value cache for the fixed system prompt, so per-request
    # generation only has to process the user's text.
    model, tokenizer = pipe.model, pipe.tokenizer
    tokenized = tokenizer.encode(
        system_only_prompt_format.format(system_message=system_prompt),
        return_tensors='pt'
    )
    # tokenized = tokenizer.apply_chat_template(start_messages, return_tensors='pt')
    # Check that this is indeed a prefix of the entire message:
    # test_messages = [*start_messages, {'role': 'user', 'content': 'Hello World!'}]
    # tokenized_test = tokenizer.apply_chat_template(test_messages, return_tensors='pt')
    # assert (tokenized_test[:, :tokenized.shape[1]] == tokenized).all().cpu().item()
    return detach_past_kv(model(tokenized.to(model.device)).past_key_values)

def generate(text, past_key_values):
    # Generate a completion for the user's partial text, optionally reusing a
    # precomputed key/value cache for the fixed system prompt.
    cur_generate_kwargs = deepcopy(generate_kwargs)
    if past_key_values:
        past_key_values = past_kv_to_device(past_key_values, pipe.model.device, pipe.model.dtype)
        cur_generate_kwargs.update({'past_key_values': past_key_values})
    response = pipe(
        prompt_format.format(system_message=system_prompt, prompt=text),
        **cur_generate_kwargs
    )[0]['generated_text']
    print(response)
    # The pipeline echoes the prompt, so keep only the assistant's part.
    return response.split('<|im_start|>assistant\n')[1]
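
# Example usage (illustrative; relies on the parse_suggestions sketch above):
#   raw = generate("Help me find a present for my", past_key_values=None)
#   suggestions = parse_suggestions(raw)  # e.g. ['girlfriend', 'mother', ...]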

if __name__ == "__main__":
    with torch.no_grad():
        # Optionally precompute the system-prompt cache and bind it here:
        # past_key_values = set_past_key_values()
        demo = gr.Interface(
            partial(generate, past_key_values=None),
            inputs="textbox", outputs="textbox"
        )
        demo.launch()