# WebWorld-8B-Onnx / advanced_examples.py
# Prince-1's picture
# Add files using upload-large-folder tool
# 5abb996 verified
"""
Advanced examples for Qwen ONNX model inference
Includes streaming, batch processing, and advanced parameters
"""
import onnxruntime_genai as og
from pathlib import Path
import time
# Default model location: the directory that contains this script. Each
# example falls back to it when called without ``model_dir``.
MODEL_DIR = Path(__file__).parent
def streaming_generation(model_dir: str = None):
    """
    Streaming generation - print text to stdout as tokens are generated.

    Args:
        model_dir: Directory passed to ``og.Model``. Defaults to the
            directory containing this script.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    print("Loading model...")
    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)
    # Decoding tokens one at a time with tokenizer.decode() can split a
    # character that spans several tokens; the stream decoder buffers
    # partial pieces and only returns complete text.
    tokenizer_stream = tokenizer.create_stream()

    prompt = "Write a short poem about the sun"
    print(f"\nPrompt: {prompt}\n")
    print("Response: ", end="", flush=True)

    input_tokens = tokenizer.encode(prompt)

    # Search options are configured on GeneratorParams, which starts from
    # the defaults shipped with the model.
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=200, temperature=0.8)

    generator = og.Generator(model, params)
    generator.append_tokens(input_tokens)

    # Stream tokens: generate_next_token() runs the model and picks the
    # next token, so no separate logits step is needed.
    while not generator.is_done():
        generator.generate_next_token()
        new_token = generator.get_next_tokens()[0]
        print(tokenizer_stream.decode(new_token), end="", flush=True)

    print("\n")
def batch_processing(model_dir: str = None):
    """
    Process multiple prompts, one after another, and print each response.

    The prompts are run sequentially with a fresh generator per prompt
    (not as a single batched forward pass).

    Args:
        model_dir: Directory passed to ``og.Model``. Defaults to the
            directory containing this script.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    prompts = [
        "What is Python?",
        "Explain machine learning in one sentence.",
        "How does neural networks work?",
    ]

    print("\nBatch Processing Example")
    print("="*60)

    # The same search options are shared by every prompt.
    params = og.GeneratorParams(model)
    params.set_search_options(max_length=150, temperature=0.6)

    for prompt in prompts:
        print(f"\nPrompt: {prompt}")
        input_tokens = tokenizer.encode(prompt)

        generator = og.Generator(model, params)
        generator.append_tokens(input_tokens)
        while not generator.is_done():
            generator.generate_next_token()

        output_tokens = generator.get_sequence(0)
        # Remove the prompt by token count rather than by comparing decoded
        # text: decoding does not always reproduce the prompt verbatim, in
        # which case a startswith() check would silently keep it.
        completion_tokens = output_tokens[len(input_tokens):]
        output_text = tokenizer.decode(completion_tokens)
        print(f"Response: {output_text.strip()}")
def chat_with_system_prompt(model_dir: str = None):
    """
    Chat with a system prompt for better control.

    Builds a two-message conversation (system + user) in the
    ``<|im_start|>role ... <|im_end|>`` layout, generates a reply and
    prints only the assistant's text.

    Args:
        model_dir: Directory passed to ``og.Model``. Defaults to the
            directory containing this script.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    print("\nChat with System Prompt")
    print("="*60)

    # Format conversation
    conversation = [
        {"role": "system", "content": "You are a Python expert. Answer questions concisely."},
        {"role": "user", "content": "How do I read a file in Python?"},
    ]

    # Build prompt in Qwen format; the trailing assistant header cues the
    # model to write the reply.
    turns = [
        f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
        for msg in conversation
    ]
    prompt_text = "".join(turns) + "<|im_start|>assistant\n"

    print(f"User: {conversation[-1]['content']}")

    input_tokens = tokenizer.encode(prompt_text)

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=300, temperature=0.6, top_p=0.95)

    generator = og.Generator(model, params)
    generator.append_tokens(input_tokens)
    while not generator.is_done():
        generator.generate_next_token()

    output_tokens = generator.get_sequence(0)
    # The sequence starts with the prompt tokens; decode only what was
    # generated so the reply does not echo the system/user template.
    reply_tokens = output_tokens[len(input_tokens):]
    output_text = tokenizer.decode(reply_tokens)
    print(f"\nAssistant: {output_text}")
def temperature_sampling_example(model_dir: str = None):
    """
    Demonstrate effect of different temperature values.

    Generates a completion of the same prompt at several temperatures and
    prints each one.

    Args:
        model_dir: Directory passed to ``og.Model``. Defaults to the
            directory containing this script.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    prompt = "The best programming language for beginners is"

    print("\nTemperature Sampling Comparison")
    print("="*60)
    print(f"Prompt: {prompt}\n")

    # The prompt is the same for every run, so tokenize it once.
    input_tokens = tokenizer.encode(prompt)

    temperatures = [0.2, 0.6, 0.9]
    for temp in temperatures:
        print(f"Temperature: {temp}")

        params = og.GeneratorParams(model)
        # Temperature only influences the output when sampling is enabled;
        # set do_sample explicitly so the comparison does not depend on the
        # model's shipped default.
        params.set_search_options(
            max_length=100,
            temperature=temp,
            do_sample=True,
        )

        generator = og.Generator(model, params)
        generator.append_tokens(input_tokens)
        while not generator.is_done():
            generator.generate_next_token()

        output_tokens = generator.get_sequence(0)
        # Drop the prompt by token count; comparing decoded text against the
        # prompt string is unreliable when decoding does not round-trip.
        completion_tokens = output_tokens[len(input_tokens):]
        output_text = tokenizer.decode(completion_tokens)
        print(f"Response: {output_text.strip()}\n")
def benchmark_inference_speed(model_dir: str = None):
    """
    Benchmark inference speed.

    Runs a few fixed prompts, printing generated-token count, elapsed time
    and tokens/sec for each, followed by the overall average. The timed
    region covers tokenization, generator setup and generation.

    Args:
        model_dir: Directory passed to ``og.Model``. Defaults to the
            directory containing this script.
    """
    if model_dir is None:
        model_dir = str(MODEL_DIR)

    model = og.Model(model_dir)
    tokenizer = og.Tokenizer(model)

    print("\nInference Speed Benchmark")
    print("="*60)

    prompts = [
        "Hello, how are you?",
        "What is AI?",
        "Explain quantum computing.",
    ]

    params = og.GeneratorParams(model)
    params.set_search_options(max_length=100, temperature=0.6)

    total_time = 0.0
    total_tokens = 0

    for prompt in prompts:
        # perf_counter() is monotonic and high-resolution; time.time() can
        # jump when the system clock is adjusted.
        start_time = time.perf_counter()

        input_tokens = tokenizer.encode(prompt)
        generator = og.Generator(model, params)
        generator.append_tokens(input_tokens)
        while not generator.is_done():
            generator.generate_next_token()

        elapsed = time.perf_counter() - start_time

        output_tokens = generator.get_sequence(0)
        # The sequence includes the prompt, so subtract its length.
        generated_tokens = len(output_tokens) - len(input_tokens)
        tokens_per_sec = generated_tokens / elapsed if elapsed > 0 else 0

        print(f"Prompt: {prompt}")
        print(f" Generated tokens: {generated_tokens}")
        print(f" Time: {elapsed:.2f}s")
        print(f" Speed: {tokens_per_sec:.2f} tokens/sec\n")

        total_time += elapsed
        total_tokens += generated_tokens

    avg_speed = total_tokens / total_time if total_time > 0 else 0
    print(f"Average speed: {avg_speed:.2f} tokens/sec")
if __name__ == "__main__":
    print("Qwen ONNX Model - Advanced Examples\n")

    # Examples in execution order: streaming, batch processing, chat with
    # a system prompt, temperature comparison, benchmark.
    examples = (
        streaming_generation,
        batch_processing,
        chat_with_system_prompt,
        temperature_sampling_example,
        benchmark_inference_speed,
    )
    for run_example in examples:
        run_example()