import gradio as gr
from huggingface_hub import InferenceClient
| """ | |
| Copied from inference in colab notebook | |
| """ | |
# import torch
# # Monkey-patch to avoid CUDA initialization issues
# torch.cuda.get_device_capability = lambda *args, **kwargs: (0, 0)
# from unsloth.chat_templates import get_chat_template
# from unsloth import FastLanguageModel
# # IMPORTING MODEL AND TOKENIZER --------------------------------------
# max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
# dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
# load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name="llama_lora_model_1",
#     max_seq_length=max_seq_length,
#     dtype=dtype,
#     load_in_4bit=load_in_4bit,
# )
# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template="llama-3.1",
# )
# FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
# # RUNNING INFERENCE ---------------------------------------------------
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#     inputs = tokenizer.apply_chat_template(
#         messages,
#         tokenize=True,
#         add_generation_prompt=True,  # Must add for generation
#         return_tensors="pt",
#     )
#     outputs = model.generate(
#         input_ids=inputs,
#         max_new_tokens=max_tokens,
#         use_cache=True,
#         temperature=temperature,  # use the UI value instead of a hardcoded 1.5
#         top_p=top_p,
#     )
#     # Decode only the newly generated tokens and yield a single string,
#     # not the list that batch_decode returns.
#     response = tokenizer.batch_decode(outputs[:, inputs.shape[1]:], skip_special_tokens=True)[0]
#     yield response
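# If token-by-token output were needed from the local model as well,
# transformers' TextStreamer could be passed to generate(); it streams decoded
# text to stdout as tokens are produced. A sketch under that assumption (not
# part of the original notebook code):
#
#     from transformers import TextStreamer
#     streamer = TextStreamer(tokenizer, skip_prompt=True)
#     model.generate(input_ids=inputs, streamer=streamer, max_new_tokens=max_tokens)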
| """ | |
| For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference | |
| """ | |
# Use the Hub repo id, not the full hub URL: InferenceClient treats a URL as a
# deployed Inference Endpoint, while a repo id is resolved via the serverless API.
client = InferenceClient(model="Heit39/llama_lora_model_1")
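# Minimal non-streaming sanity check (a sketch; assumes the model is actually
# served by the Inference API, and that a token is passed for private repos,
# e.g. InferenceClient(model=..., token=os.environ["HF_TOKEN"])):
#
#     result = client.chat_completion(
#         [{"role": "user", "content": "Hello!"}],
#         max_tokens=32,
#     )
#     print(result.choices[0].message.content)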
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    # Rebuild the full conversation: system prompt, then alternating turns.
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
| response = "" | |
| for message in client.chat_completion( | |
| messages, | |
| max_tokens=max_tokens, | |
| stream=True, | |
| temperature=temperature, | |
| top_p=top_p, | |
| ): | |
| token = message.choices[0].delta.content | |
| response += token | |
| yield response | |
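# Example of driving the generator directly, outside Gradio (hypothetical
# argument values; requires network access to the Inference API):
#
#     for partial in respond("Hi!", [], "You are a friendly Chatbot.", 64, 0.7, 0.95):
#         pass
#     print(partial)  # the final accumulated response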
| """ | |
| For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface | |
| """ | |
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            value=0.95,
            step=0.05,
            label="Top-p (nucleus sampling)",
        ),
    ],
)
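# Optional customization (a sketch; `title` and `description` are standard
# gr.ChatInterface keyword arguments, and the strings here are made up):
#
#     demo = gr.ChatInterface(
#         respond,
#         title="LLaMA LoRA Chat",
#         description="Chat with the fine-tuned llama_lora_model_1 via the Inference API.",
#         additional_inputs=[...],  # same textbox and sliders as above
#     )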
if __name__ == "__main__":
    demo.launch()