Spaces:
Sleeping
Sleeping
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import transformers | |
| import torch | |
| import gradio as gr | |
# Single source of truth for the half-precision dtype used below.
desired_dtype = torch.bfloat16
torch.set_default_dtype(desired_dtype)

# Hub checkpoint served by this app.
# ("vsrinivas/falconlite2" and torch_dtype="auto" were earlier experiments.)
checkpoint = "tiiuae/falcon-7b-instruct"

# Load the causal LM and its tokenizer from the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    trust_remote_code=True,
    offload_folder="offload",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)

# Wrap the already-loaded model and tokenizer in a text-generation pipeline;
# generate_seqs() calls this object.
_pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "torch_dtype": desired_dtype,
    "trust_remote_code": True,
    "device_map": "auto",
}
pipeline = transformers.pipeline("text-generation", **_pipeline_options)
def format_chat_prompt(message, chat_history, instruction):
    """Build the plain-text prompt for the next assistant turn.

    The prompt is a ``System:`` line, then one ``User:`` / ``Assistant:``
    pair per past turn, then the new user message and an open
    ``Assistant:`` cue for the model to complete.

    Args:
        message: The new user message.
        chat_history: Iterable of ``(user_message, bot_message)`` pairs,
            oldest first. ``None`` is treated as an empty history.
        instruction: System instruction placed on the first line. ``None``
            is treated as an empty string so the literal text "None" never
            ends up in the prompt.

    Returns:
        The prompt as a single newline-separated string.
    """
    if instruction is None:
        instruction = ""

    lines = [f"System:{instruction}"]
    for user_message, bot_message in chat_history or ():
        lines.append(f"User: {user_message}")
        lines.append(f"Assistant: {bot_message}")
    lines.append(f"User: {message}")
    # No trailing space: the model continues right after the colon.
    lines.append("Assistant:")
    return "\n".join(lines)
def generate_seqs(prompt, max_new_tokens=None, stop_sequence=None, temperature=None):
    """Sample one continuation of ``prompt`` from the module-level pipeline.

    Args:
        prompt: Text to continue.
        max_new_tokens: Upper bound on newly generated tokens. When ``None``
            the total length is capped with ``max_length=200`` instead.
        stop_sequence: A string or an iterable of strings. The completion
            (the text after ``prompt``) is cut at the earliest occurrence of
            any of them. ``None`` disables trimming.
        temperature: Sampling temperature. Omitted from the call when
            ``None`` so the model's own default applies.

    Returns:
        The ``generated_text`` of the first returned sequence, with the
        completion trimmed at the first stop sequence when one is found.
    """
    generation_kwargs = {
        "truncation": True,
        "do_sample": True,
        "top_k": 10,
        "num_return_sequences": 1,
        "eos_token_id": tokenizer.eos_token_id,
    }
    # max_length and max_new_tokens are competing length limits, so send
    # exactly one of them rather than both.
    if max_new_tokens is None:
        generation_kwargs["max_length"] = 200
    else:
        generation_kwargs["max_new_tokens"] = max_new_tokens
    if temperature is not None:
        generation_kwargs["temperature"] = temperature

    output = pipeline(prompt, **generation_kwargs)
    generated = output[0]['generated_text']

    # NOTE(review): the pipeline's own stop_sequence option appears to accept
    # a single string and to act only on its first token (overriding
    # eos_token_id) - confirm against the installed transformers version.
    # Stop sequences are therefore applied here, on the decoded text.
    if stop_sequence is None:
        stops = []
    elif isinstance(stop_sequence, str):
        stops = [stop_sequence]
    else:
        stops = [stop for stop in stop_sequence if stop]

    # Only the completion is trimmed: the prompt itself legitimately
    # contains turn markers such as "\nUser:".
    if not stops or not generated.startswith(prompt):
        return generated

    completion = generated[len(prompt):]
    hits = [pos for pos in (completion.find(stop) for stop in stops) if pos != -1]
    if hits:
        completion = completion[:min(hits)]
    return prompt + completion
def respond(message, chat_history, instruction, temperature=0.7):
    """Gradio event handler: answer ``message`` and update the chat.

    This is a generator. Every yielded value is a ``(textbox_value,
    chat_history)`` pair: the textbox value is always ``""`` (clearing the
    prompt box) and the history ends with the ``[message, reply_so_far]``
    turn.

    Args:
        message: The new user message.
        chat_history: Past ``[user_message, bot_message]`` turns. ``None``
            is treated as an empty history.
        instruction: System instruction forwarded to the prompt builder.
        temperature: Sampling temperature forwarded to ``generate_seqs``.

    Yields:
        ``("", updated_history)``, first with an empty reply (before
        generation starts) and then once per character of the reply.
    """
    history = list(chat_history or [])
    prompt = format_chat_prompt(message, history, instruction)

    # Show the user's turn right away. This also guarantees the UI is
    # updated at least once, even when the model returns an empty reply.
    yield "", history + [[message, ""]]

    # NOTE(review): 8192 new tokens looks larger than the model's context
    # window and is slow on limited hardware - confirm the intended budget.
    generated = generate_seqs(
        prompt=prompt,
        max_new_tokens=8192,
        # Stop sequences keep the model from writing the user's next turn.
        stop_sequence=["\nUser:", "<|endoftext|>"],
        temperature=temperature,
    )

    # The reply is whatever follows the prompt. Splitting on 'Assistant: '
    # fails when the reply is empty, does not start with a space, or itself
    # contains that marker, so it is only a fallback.
    if generated.startswith(prompt):
        reply = generated[len(prompt):]
    else:
        reply = generated.split('Assistant: ')[-1]
    reply = reply.lstrip(" ")

    # The full text is already available; emit it one character at a time
    # to keep the typing effect in the chat window.
    shown = ""
    for char in reply:
        shown += char
        yield "", history + [[message, shown]]
# Build the chat UI: a chat window, a prompt box, collapsible advanced
# options (system message and temperature), and submit / clear buttons.
with gr.Blocks() as demo:
    # Written as explicit lines so the Markdown headings carry no leading
    # indentation.
    gr.Markdown(
        "\n"
        "# General purpose chatbot - test & demo app by Srinivas.V..\n"
        "## As this is a free hosted platform (Computing and Memory limitations), you will find it slow and the app may not provide appropriate answers after a few dialogues. Type in your prompt, click/ submit and wait for the response before typing in your next prompt.\n"
    )
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(label="Prompt")
    with gr.Accordion(label="Advanced options", open=False):
        system = gr.Textbox(
            label="System message",
            lines=2,
            value="A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.",
        )
        temperature = gr.Slider(label="temperature", minimum=0.1, maximum=1, value=0.7, step=0.1)
    btn = gr.Button("Submit")
    # NOTE(review): this also resets the system message and temperature,
    # not just the conversation - confirm that is intended.
    clear = gr.ClearButton(components=[msg, chatbot, system, temperature], value="Clear console")

    # The button click and pressing Enter in the prompt box run the same
    # handler with the same inputs and outputs.
    respond_event = {
        "fn": respond,
        "inputs": [msg, chatbot, system, temperature],
        "outputs": [msg, chatbot],
    }
    btn.click(**respond_event)
    msg.submit(**respond_event)

# Close any Gradio servers left over from earlier runs, then serve the app
# with request queuing enabled (needed for the generator-based handler).
gr.close_all()
demo.queue().launch()