| """ |
| Advanced examples for Qwen ONNX model inference |
| Includes streaming, batch processing, and advanced parameters |
| """ |
|
|
| import onnxruntime_genai as og |
| from pathlib import Path |
| import time |
|
|
# Directory containing this script; every example falls back to it when no
# model_dir argument is supplied.
MODEL_DIR = Path(__file__).parent
|
|
def streaming_generation(model_dir: str = None):
    """
    Stream a response to a fixed prompt, printing text as it is generated.

    Args:
        model_dir: Directory passed to ``og.Model``. When ``None``, the
            directory containing this script (``MODEL_DIR``) is used.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    print("Loading model...")
    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)
    # The tokenizer stream keeps decoding state between calls, so characters
    # or words that span several tokens are printed correctly. Decoding each
    # token in isolation can emit broken fragments.
    tokenizer_stream = tokenizer.create_stream()

    prompt = "Write a short poem about the sun"
    print(f"\nPrompt: {prompt}\n")
    print("Response: ", end="", flush=True)

    input_tokens = tokenizer.encode(prompt)

    # Search options are configured on GeneratorParams. do_sample=True is
    # required for temperature to have any effect; otherwise the generator
    # performs greedy search.
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=200, temperature=0.8, do_sample=True)

    generator = og.Generator(model, params)
    generator.append_tokens(input_tokens)

    while not generator.is_done():
        # generate_next_token() runs the model and picks the next token; there
        # is no separate compute_logits() step in the append_tokens-era API.
        generator.generate_next_token()
        # Fetch only the newly produced token instead of copying the whole
        # sequence on every step.
        new_token = generator.get_next_tokens()[0]
        print(tokenizer_stream.decode(new_token), end="", flush=True)

    print("\n")
|
|
|
|
def batch_processing(model_dir: str = None):
    """
    Generate and print a response for each prompt in a fixed list.

    The model and tokenizer are loaded once and reused; a fresh generator is
    created per prompt so the prompts do not share context.

    Args:
        model_dir: Directory passed to ``og.Model``. When ``None``, the
            directory containing this script (``MODEL_DIR``) is used.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    prompts = [
        "What is Python?",
        "Explain machine learning in one sentence.",
        "How does neural networks work?",
    ]

    print("\nBatch Processing Example")
    print("=" * 60)

    # One set of search options shared by every prompt. do_sample=True is
    # needed for temperature to influence the output.
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=150, temperature=0.6, do_sample=True)

    for prompt in prompts:
        print(f"\nPrompt: {prompt}")

        input_tokens = tokenizer.encode(prompt)
        generator = og.Generator(model, params)
        generator.append_tokens(input_tokens)

        while not generator.is_done():
            generator.generate_next_token()

        output_tokens = generator.get_sequence(0)
        # The sequence starts with the prompt tokens. Decode only what follows
        # them rather than stripping the prompt from the decoded text, which
        # fails whenever the encode/decode round trip alters the prompt.
        generated_tokens = output_tokens[len(input_tokens):]
        output_text = tokenizer.decode(generated_tokens)

        print(f"Response: {output_text.strip()}")
|
|
|
|
def chat_with_system_prompt(model_dir: str = None):
    """
    Run a single chat turn with a system prompt and print the reply.

    The conversation is rendered with ``<|im_start|>`` / ``<|im_end|>``
    markers and ends with an open assistant turn for the model to complete.

    Args:
        model_dir: Directory passed to ``og.Model``. When ``None``, the
            directory containing this script (``MODEL_DIR``) is used.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    print("\nChat with System Prompt")
    print("=" * 60)

    conversation = [
        {"role": "system", "content": "You are a Python expert. Answer questions concisely."},
        {"role": "user", "content": "How do I read a file in Python?"},
    ]

    # Render each message as one turn, then open the assistant turn.
    rendered_turns = [
        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
        for msg in conversation
    ]
    prompt_text = "".join(rendered_turns) + "<|im_start|>assistant\n"

    print(f"User: {conversation[-1]['content']}")

    input_tokens = tokenizer.encode(prompt_text)

    # do_sample=True is required for temperature / top_p to take effect.
    params = og.GeneratorParams(model)
    params.set_search_options(
        max_length=300,
        temperature=0.6,
        top_p=0.95,
        do_sample=True,
    )

    generator = og.Generator(model, params)
    generator.append_tokens(input_tokens)

    while not generator.is_done():
        generator.generate_next_token()

    output_tokens = generator.get_sequence(0)
    # Skip the prompt tokens so the system and user turns are not echoed back
    # as part of the assistant's reply.
    output_text = tokenizer.decode(output_tokens[len(input_tokens):])

    print(f"\nAssistant: {output_text}")
|
|
|
|
def temperature_sampling_example(model_dir: str = None):
    """
    Generate a completion of one prompt at several temperatures and print each.

    Args:
        model_dir: Directory passed to ``og.Model``. When ``None``, the
            directory containing this script (``MODEL_DIR``) is used.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    prompt = "The best programming language for beginners is"

    print("\nTemperature Sampling Comparison")
    print("=" * 60)
    print(f"Prompt: {prompt}\n")

    temperatures = [0.2, 0.6, 0.9]

    # The prompt is the same for every run, so tokenize it once.
    input_tokens = tokenizer.encode(prompt)

    for temp in temperatures:
        print(f"Temperature: {temp}")

        # Sampling must be enabled explicitly; with greedy search every
        # temperature would produce the same output and the comparison
        # would be meaningless.
        params = og.GeneratorParams(model)
        params.set_search_options(max_length=100, temperature=temp, do_sample=True)

        generator = og.Generator(model, params)
        generator.append_tokens(input_tokens)

        while not generator.is_done():
            generator.generate_next_token()

        output_tokens = generator.get_sequence(0)
        # Decode only the tokens that follow the prompt.
        output_text = tokenizer.decode(output_tokens[len(input_tokens):])

        print(f"Response: {output_text.strip()}\n")
|
|
|
|
def benchmark_inference_speed(model_dir: str = None):
    """
    Measure generation throughput over a few fixed prompts.

    For each prompt, prints the number of generated tokens, the elapsed time
    and the tokens-per-second rate, followed by the overall average. The
    timed region covers tokenization, generator creation and generation;
    model loading is excluded.

    Args:
        model_dir: Directory passed to ``og.Model``. When ``None``, the
            directory containing this script (``MODEL_DIR``) is used.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    print("\nInference Speed Benchmark")
    print("=" * 60)

    prompts = [
        "Hello, how are you?",
        "What is AI?",
        "Explain quantum computing.",
    ]

    # do_sample=True is needed for temperature to influence the output.
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=100, temperature=0.6, do_sample=True)

    total_time = 0.0
    total_tokens = 0

    for prompt in prompts:
        # perf_counter is monotonic and high-resolution, unlike time.time(),
        # which can jump if the system clock is adjusted.
        start_time = time.perf_counter()

        input_tokens = tokenizer.encode(prompt)
        generator = og.Generator(model, params)
        generator.append_tokens(input_tokens)

        while not generator.is_done():
            generator.generate_next_token()

        elapsed = time.perf_counter() - start_time
        output_tokens = generator.get_sequence(0)

        # The sequence includes the prompt, so subtract its length.
        generated_tokens = len(output_tokens) - len(input_tokens)
        tokens_per_sec = generated_tokens / elapsed if elapsed > 0 else 0

        print(f"Prompt: {prompt}")
        print(f"  Generated tokens: {generated_tokens}")
        print(f"  Time: {elapsed:.2f}s")
        print(f"  Speed: {tokens_per_sec:.2f} tokens/sec\n")

        total_time += elapsed
        total_tokens += generated_tokens

    avg_speed = total_tokens / total_time if total_time > 0 else 0
    print(f"Average speed: {avg_speed:.2f} tokens/sec")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: print a banner, then run each example in order.
    print("Qwen ONNX Model - Advanced Examples\n")

    examples = (
        streaming_generation,
        batch_processing,
        chat_with_system_prompt,
        temperature_sampling_example,
        benchmark_inference_speed,
    )
    for run_example in examples:
        run_example()
|
|