""" Advanced examples for Qwen ONNX model inference Includes streaming, batch processing, and advanced parameters """ import onnxruntime_genai as og from pathlib import Path import time MODEL_DIR = Path(__file__).parent def streaming_generation(model_dir: str = None): """ Streaming generation - get tokens as they're generated """ if model_dir is None: model_dir = str(MODEL_DIR) print("Loading model...") model = og.Model(model_dir) tokenizer = og.Tokenizer(model) prompt = "Write a short poem about the sun" print(f"\nPrompt: {prompt}\n") print("Response: ", end="", flush=True) input_tokens = tokenizer.encode(prompt) config = model.get_default_generation_search_parameters() config.max_length = 200 config.temperature = 0.8 generator = og.Generator(model, config) generator.append_tokens(input_tokens) # Stream tokens while not generator.is_done(): generator.compute_logits() generator.generate_next_token() # Get current token output_tokens = generator.get_sequence(0) if len(output_tokens) > 0: last_token = output_tokens[-1:] token_text = tokenizer.decode(last_token) print(token_text, end="", flush=True) print("\n") def batch_processing(model_dir: str = None): """ Process multiple prompts """ if model_dir is None: model_dir = str(MODEL_DIR) model = og.Model(model_dir) tokenizer = og.Tokenizer(model) prompts = [ "What is Python?", "Explain machine learning in one sentence.", "How does neural networks work?", ] print("\nBatch Processing Example") print("="*60) config = model.get_default_generation_search_parameters() config.max_length = 150 config.temperature = 0.6 for prompt in prompts: print(f"\nPrompt: {prompt}") input_tokens = tokenizer.encode(prompt) generator = og.Generator(model, config) generator.append_tokens(input_tokens) while not generator.is_done(): generator.compute_logits() generator.generate_next_token() output_tokens = generator.get_sequence(0) output_text = tokenizer.decode(output_tokens) # Remove prompt from output if output_text.startswith(prompt): output_text = output_text[len(prompt):] print(f"Response: {output_text.strip()}") def chat_with_system_prompt(model_dir: str = None): """ Chat with a system prompt for better control """ if model_dir is None: model_dir = str(MODEL_DIR) model = og.Model(model_dir) tokenizer = og.Tokenizer(model) print("\nChat with System Prompt") print("="*60) # Format conversation conversation = [ {"role": "system", "content": "You are a Python expert. Answer questions concisely."}, {"role": "user", "content": "How do I read a file in Python?"}, ] # Build prompt in Qwen format prompt_text = "" for msg in conversation: role = msg["role"] content = msg["content"] prompt_text += f"<|im_start|>{role}\n{content}<|im_end|>\n" prompt_text += "<|im_start|>assistant\n" print(f"User: {conversation[-1]['content']}") input_tokens = tokenizer.encode(prompt_text) config = model.get_default_generation_search_parameters() config.max_length = 300 config.temperature = 0.6 config.top_p = 0.95 generator = og.Generator(model, config) generator.append_tokens(input_tokens) while not generator.is_done(): generator.compute_logits() generator.generate_next_token() output_tokens = generator.get_sequence(0) output_text = tokenizer.decode(output_tokens) print(f"\nAssistant: {output_text}") def temperature_sampling_example(model_dir: str = None): """ Demonstrate effect of different temperature values """ if model_dir is None: model_dir = str(MODEL_DIR) model = og.Model(model_dir) tokenizer = og.Tokenizer(model) prompt = "The best programming language for beginners is" print("\nTemperature Sampling Comparison") print("="*60) print(f"Prompt: {prompt}\n") temperatures = [0.2, 0.6, 0.9] for temp in temperatures: print(f"Temperature: {temp}") input_tokens = tokenizer.encode(prompt) config = model.get_default_generation_search_parameters() config.max_length = 100 config.temperature = temp generator = og.Generator(model, config) generator.append_tokens(input_tokens) while not generator.is_done(): generator.compute_logits() generator.generate_next_token() output_tokens = generator.get_sequence(0) output_text = tokenizer.decode(output_tokens) if output_text.startswith(prompt): output_text = output_text[len(prompt):] print(f"Response: {output_text.strip()}\n") def benchmark_inference_speed(model_dir: str = None): """ Benchmark inference speed """ if model_dir is None: model_dir = str(MODEL_DIR) model = og.Model(model_dir) tokenizer = og.Tokenizer(model) print("\nInference Speed Benchmark") print("="*60) prompts = [ "Hello, how are you?", "What is AI?", "Explain quantum computing.", ] config = model.get_default_generation_search_parameters() config.max_length = 100 config.temperature = 0.6 total_time = 0 total_tokens = 0 for prompt in prompts: start_time = time.time() input_tokens = tokenizer.encode(prompt) generator = og.Generator(model, config) generator.append_tokens(input_tokens) while not generator.is_done(): generator.compute_logits() generator.generate_next_token() elapsed = time.time() - start_time output_tokens = generator.get_sequence(0) generated_tokens = len(output_tokens) - len(input_tokens) tokens_per_sec = generated_tokens / elapsed if elapsed > 0 else 0 print(f"Prompt: {prompt}") print(f" Generated tokens: {generated_tokens}") print(f" Time: {elapsed:.2f}s") print(f" Speed: {tokens_per_sec:.2f} tokens/sec\n") total_time += elapsed total_tokens += generated_tokens avg_speed = total_tokens / total_time if total_time > 0 else 0 print(f"Average speed: {avg_speed:.2f} tokens/sec") if __name__ == "__main__": # Run examples print("Qwen ONNX Model - Advanced Examples\n") # Example 1: Streaming streaming_generation() # Example 2: Batch processing batch_processing() # Example 3: Chat with system prompt chat_with_system_prompt() # Example 4: Temperature comparison temperature_sampling_example() # Example 5: Benchmark benchmark_inference_speed()