import pprint
import subprocess
from threading import Thread
import gradio as gr
from optimum.intel.openvino import OVModelForCausalLM
from transformers import AutoTokenizer, TextIteratorStreamer
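
# Log the host's CPU details via the Linux `lscpu` utility, so the hardware the
# demo runs on (e.g. a Hugging Face Space) is visible in the console output.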
result = subprocess.run(["lscpu"], text=True, capture_output=True)
pprint.pprint(result.stdout)

original_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "helenai/mistralai-Mistral-7B-Instruct-v0.2-ov"
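
# OVModelForCausalLM exposes the familiar transformers from_pretrained API but
# executes inference with OpenVINO. The checkpoint above is already converted;
# alternatively (untested here), optimum-intel can convert the original model
# on the fly with OVModelForCausalLM.from_pretrained(original_model_id, export=True).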
model = OVModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
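

# run_generation is a generator: Gradio treats each yielded value as an
# incremental update to the output component, which produces the streaming UI.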
def run_generation(user_text, top_p, temperature, top_k, max_new_tokens):
    # Wrap the raw user input in the chat template the instruct model expects.
    message = [{"role": "user", "content": user_text}]
    model_inputs = tokenizer.apply_chat_template(message, return_tensors="pt", return_dict=True)

    # Generation runs in a background thread; TextIteratorStreamer yields the
    # decoded text piece by piece as tokens are produced.
    streamer = TextIteratorStreamer(
        tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
    )
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
    )
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    # Accumulate the partial output and yield it so the output textbox updates
    # in real time.
    model_output = ""
    for new_text in streamer:
        model_output += new_text
        yield model_output
    return model_output


def reset_textbox():
    # Clears the input textbox; defined for parity with the original Space but
    # not wired to any event in this script.
    return gr.update(value="")


with gr.Blocks() as demo:
    original_link = "https://huggingface.co/spaces/joaogante/transformers_streaming"
    gr.Markdown(
        "# OpenVINO and 🤗 Transformers 🔥Streaming🔥 on Gradio\n"
        "This demo showcases the use of the "
        "[streaming feature](https://huggingface.co/docs/transformers/main/en/generation_strategies#streaming) "
        "of 🤗 Transformers with OpenVINO models and Gradio to generate text in real time. It uses "
        f"[{original_model_id}](https://huggingface.co/{original_model_id}), "
        "converted to OpenVINO.\n\n"
        f"This space was duplicated from {original_link} and modified for OpenVINO models."
    )

    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(label="User input")
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")

        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1,
                maximum=1000,
                value=250,
                step=1,
                interactive=True,
                label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05,
                maximum=1.0,
                value=0.95,
                step=0.05,
                interactive=True,
                label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=1,
                maximum=50,
                value=50,
                step=1,
                interactive=True,
                label="Top-k",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=5.0,
                value=0.8,
                step=0.1,
                interactive=True,
                label="Temperature",
            )

    # Pressing Enter in the textbox and clicking Submit both trigger the
    # streaming generation function.
    user_text.submit(
        run_generation,
        [user_text, top_p, temperature, top_k, max_new_tokens],
        model_output,
    )
    button_submit.click(
        run_generation,
        [user_text, top_p, temperature, top_k, max_new_tokens],
        model_output,
    )

# demo.queue() already enables request queueing; the old enable_queue launch
# flag is deprecated (and removed in Gradio 4), so it is dropped here.
demo.queue(max_size=32).launch(server_name="0.0.0.0")
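
# With server_name="0.0.0.0" the app listens on all interfaces; by default
# Gradio serves on port 7860, e.g. http://<host>:7860.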