{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Qwen-0.5B Model\n", "> Quantized and Non-Quantized" ] }, { "cell_type": "code", "execution_count": 1, "id": "3ec71d43", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/administrator/miniconda/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "===== Testing Qwen-0.5B (quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/qwen_onnx_quantized/model_quantized.onnx...\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France?\n", "The first step in the process of creating a new business is to identify the market. This is\n", "Generation time: 9.88 seconds\n", "\n", "===== Testing Qwen-0.5B (non-quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/qwen_onnx/model.onnx...\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France? 
Paris.\n", "Generation time: 19.48 seconds\n" ] } ], "source": [ "import numpy as np\n", "import onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import time\n", "\n", "def test_qwen_model_with_generation(model_type=\"quantized\", max_new_tokens=20):\n", " print(f\"\\n===== Testing Qwen-0.5B ({model_type}) with full generation =====\")\n", " start_time = time.time()\n", " \n", " # Set paths based on model type\n", " if model_type == \"quantized\":\n", " model_dir = \"onnx_models/qwen_onnx_quantized\"\n", " model_path = f\"{model_dir}/model_quantized.onnx\"\n", " else:\n", " model_dir = \"onnx_models/qwen_onnx\"\n", " model_path = f\"{model_dir}/model.onnx\"\n", " \n", " try:\n", " # Load tokenizer\n", " print(\"Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen1.5-0.5B\")\n", " \n", " # Create ONNX Runtime session\n", " print(f\"Loading model from {model_path}...\")\n", " session = ort.InferenceSession(model_path)\n", " \n", " # Test prompt\n", " prompt = \"What is the capital of France?\"\n", " \n", " # Tokenize the input\n", " inputs = tokenizer(prompt, return_tensors=\"np\")\n", " input_ids = inputs[\"input_ids\"]\n", " \n", " # Get model input and output names\n", " input_names = [inp.name for inp in session.get_inputs()]\n", " output_names = [outp.name for outp in session.get_outputs()]\n", " \n", " # Start generation loop\n", " generated_ids = input_ids.copy()\n", " for i in range(max_new_tokens):\n", " # Create inputs for this step\n", " current_length = generated_ids.shape[1]\n", " attention_mask = np.ones((1, current_length), dtype=np.int64)\n", " position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)\n", " \n", " # Create past KV caches (empty for first iteration)\n", " batch_size = 1\n", " num_heads = 16 # For Qwen-0.5B\n", " head_dim = 64 # For Qwen-0.5B\n", " seq_len = 0 if i == 0 else current_length - 1\n", " \n", " ort_inputs = {\n", " \"input_ids\": generated_ids[:, -1:] if i > 0 
else generated_ids,\n", " \"attention_mask\": attention_mask,\n", " }\n", " \n", " if \"position_ids\" in input_names:\n", " ort_inputs[\"position_ids\"] = position_ids[:, -1:] if i > 0 else position_ids\n", " \n", " for layer_idx in range(24): # Qwen-0.5B has 24 layers\n", " if f\"past_key_values.{layer_idx}.key\" in input_names:\n", " if i == 0: # First iteration - empty past\n", " empty_key = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " empty_value = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = empty_key\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = empty_value\n", " else: # Use past from previous iteration\n", " past_key_name = f\"present.{layer_idx}.key\"\n", " past_value_name = f\"present.{layer_idx}.value\"\n", " # Find the index of the past states in the outputs\n", " past_key_idx = output_names.index(past_key_name) if past_key_name in output_names else None\n", " past_value_idx = output_names.index(past_value_name) if past_value_name in output_names else None\n", " \n", " if past_key_idx is not None and past_value_idx is not None:\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = previous_outputs[past_key_idx]\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = previous_outputs[past_value_idx]\n", " \n", " # Run inference\n", " outputs = session.run(output_names, ort_inputs)\n", " previous_outputs = outputs # Save for next iteration\n", " \n", " # Find the logits output\n", " logits_idx = next((i for i, name in enumerate(output_names) if 'logits' in name), 0)\n", " logits = outputs[logits_idx]\n", " \n", " # Get next token with greedy sampling\n", " next_token_id = np.argmax(logits[0, -1, :])\n", " \n", " # Add token to generated_ids\n", " generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)\n", " \n", " # Check for end of sequence token\n", " if next_token_id == 
tokenizer.eos_token_id:\n", " break\n", " \n", " # Get the generated text\n", " generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " print(f\"Prompt: {prompt}\")\n", " print(f\"Response: {generated_text}\")\n", " print(f\"Generation time: {time.time() - start_time:.2f} seconds\")\n", " \n", " except Exception as e:\n", " print(f\"Error testing Qwen model: {str(e)}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "if __name__ == \"__main__\":\n", " # Test both quantized and non-quantized versions\n", " test_qwen_model_with_generation(\"quantized\")\n", " test_qwen_model_with_generation(\"non-quantized\")" ] }, { "cell_type": "markdown", "id": "e2cd5ad3", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "id": "0100fd67", "metadata": {}, "source": [ "### TinyLlama-1.1B\n", "> Quantized and Non-Quantized" ] }, { "cell_type": "code", "execution_count": 2, "id": "09af0532", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "===== Testing TinyLlama-1.1B (quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/tinyllama_onnx_quantized/model_quantized.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 4, 'past_sequence_length', 64]\n", "Prompt: <|system|>\n", "You are a helpful assistant.\n", "<|user|>\n", "What is the capital of France?<|assistant|>\n", "Response: <|system|>\n", "You are a helpful assistant.\n", "<|user|>\n", "What is the capital of France?<|assistant|>\n", "France is the capital of France.\n", "Generation time: 11.95 seconds\n", "\n", "===== Testing TinyLlama-1.1B (non-quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/tinyllama_onnx/model.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 4, 'past_sequence_length', 64]\n", "Prompt: <|system|>\n", "You are a helpful assistant.\n", "<|user|>\n", "What is the capital of 
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_tinyllama_model_with_generation(model_type="quantized", max_new_tokens=20):
    """Greedy, token-by-token generation with the exported TinyLlama-1.1B ONNX model.

    Args:
        model_type: "quantized" selects onnx_models/tinyllama_onnx_quantized/model_quantized.onnx;
            anything else selects onnx_models/tinyllama_onnx/model.onnx.
        max_new_tokens: upper bound on generated tokens; stops early on EOS.
    """
    print(f"\n===== Testing TinyLlama-1.1B ({model_type}) with full generation =====")
    start_time = time.time()

    # Set paths based on model type
    if model_type == "quantized":
        model_dir = "onnx_models/tinyllama_onnx_quantized"
        model_path = f"{model_dir}/model_quantized.onnx"
    else:
        model_dir = "onnx_models/tinyllama_onnx"
        model_path = f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        # TinyLlama-1.1B uses grouped-query attention: 4 KV heads x 64 dims.
        # Prefer the dims declared by the exported graph (as the other model
        # cells in this notebook do) and fall back to the known values.
        num_heads = 4
        head_dim = 64
        if sample_past_key is not None and len(sample_past_key.shape) == 4:
            if isinstance(sample_past_key.shape[1], int):
                num_heads = sample_past_key.shape[1]
            if isinstance(sample_past_key.shape[3], int):
                head_dim = sample_past_key.shape[3]

        # Test prompt - format properly for chat model
        prompt = "<|system|>\nYou are a helpful assistant.\n<|user|>\nWhat is the capital of France?<|assistant|>"
        inputs = tokenizer(prompt, return_tensors="np")
        input_ids = inputs["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        batch_size = 1
        previous_outputs = None  # present.* tensors from the previous step
        generated_ids = input_ids.copy()
        for i in range(max_new_tokens):
            current_length = generated_ids.shape[1]
            attention_mask = np.ones((1, current_length), dtype=np.int64)

            # After the first step only the newest token is fed; the KV cache
            # carries the context for everything before it.
            ort_inputs = {
                "input_ids": generated_ids[:, -1:] if i > 0 else generated_ids,
                "attention_mask": attention_mask,
            }
            if "position_ids" in input_names:
                position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)
                ort_inputs["position_ids"] = position_ids[:, -1:] if i > 0 else position_ids

            for layer_idx in range(22):  # TinyLlama-1.1B has 22 layers
                if f"past_key_values.{layer_idx}.key" not in input_names:
                    continue
                if i == 0:
                    # First iteration: zero-length past (prompt only).
                    empty_key = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    empty_value = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    ort_inputs[f"past_key_values.{layer_idx}.key"] = empty_key
                    ort_inputs[f"past_key_values.{layer_idx}.value"] = empty_value
                else:
                    # Feed the cache produced by the previous step back in.
                    past_key_name = f"present.{layer_idx}.key"
                    past_value_name = f"present.{layer_idx}.value"
                    if past_key_name in output_names and past_value_name in output_names:
                        ort_inputs[f"past_key_values.{layer_idx}.key"] = previous_outputs[output_names.index(past_key_name)]
                        ort_inputs[f"past_key_values.{layer_idx}.value"] = previous_outputs[output_names.index(past_value_name)]

            outputs = session.run(output_names, ort_inputs)
            previous_outputs = outputs  # Save for next iteration

            logits_idx = next((idx for idx, name in enumerate(output_names) if 'logits' in name), 0)
            logits = outputs[logits_idx]

            # Greedy sampling: highest-probability token at the last position.
            next_token_id = np.argmax(logits[0, -1, :])
            generated_ids = np.concatenate(
                [generated_ids, np.array([[next_token_id]], dtype=generated_ids.dtype)], axis=1
            )

            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing TinyLlama model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Test both quantized and non-quantized versions
    test_tinyllama_model_with_generation("quantized")
    test_tinyllama_model_with_generation("non-quantized")
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_phi_model_with_generation(max_new_tokens=20):
    """Greedy token-by-token generation with the exported Phi-1.5 ONNX model."""
    print("\n===== Testing Phi-1.5 with full generation =====")
    start_time = time.time()

    model_dir = "onnx_models/phi_onnx"
    model_path = f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        prompt = "What is the capital of France?"
        encoded = tokenizer(prompt, return_tensors="np")
        input_ids = encoded["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        # Architecture defaults for Phi-1.5, overridden by any concrete dims
        # the exported graph declares.
        num_heads = 32
        head_dim = 80
        if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:
            declared = sample_past_key.shape
            num_heads = declared[1] if isinstance(declared[1], int) else num_heads
            head_dim = declared[3] if isinstance(declared[3], int) else head_dim
        print(f"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs")

        generated_ids = input_ids.copy()
        for step in range(max_new_tokens):
            total_len = generated_ids.shape[1]
            # After the prefill step, only the newest token is fed; the cache
            # supplies the rest of the context.
            feed = {
                "input_ids": generated_ids if step == 0 else generated_ids[:, -1:],
                "attention_mask": np.ones((1, total_len), dtype=np.int64),
            }
            if "position_ids" in input_names:
                all_positions = np.arange(total_len, dtype=np.int64).reshape(1, -1)
                feed["position_ids"] = all_positions if step == 0 else all_positions[:, -1:]

            for layer_idx in range(24):  # Phi-1.5 has 24 layers
                key_in = f"past_key_values.{layer_idx}.key"
                if key_in not in input_names:
                    continue
                value_in = f"past_key_values.{layer_idx}.value"
                if step == 0:
                    # Zero-length cache on the first pass.
                    feed[key_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                    feed[value_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                else:
                    key_out = f"present.{layer_idx}.key"
                    value_out = f"present.{layer_idx}.value"
                    if key_out in output_names and value_out in output_names:
                        feed[key_in] = previous_outputs[output_names.index(key_out)]
                        feed[value_in] = previous_outputs[output_names.index(value_out)]

            outputs = session.run(output_names, feed)
            previous_outputs = outputs  # cache tensors for the next step

            # Locate the logits output, then greedily pick the next token.
            logits_pos = next((pos for pos, name in enumerate(output_names) if 'logits' in name), 0)
            next_token_id = np.argmax(outputs[logits_pos][0, -1, :])

            generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)
            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing Phi model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_phi_model_with_generation()
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_falcon_model_with_generation(max_new_tokens=20):
    """Greedy, token-by-token generation with the exported Falcon-RW-1B ONNX model.

    Args:
        max_new_tokens: upper bound on generated tokens; stops early on EOS.
    """
    print("\n===== Testing Falcon-RW-1B with full generation =====")
    start_time = time.time()

    model_dir = "onnx_models/falcon_onnx"
    model_path = f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        prompt = "What is the capital of France?"
        inputs = tokenizer(prompt, return_tensors="np")
        input_ids = inputs["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        # Falcon-RW-1B has 32 attention heads of dim 64 (hidden size 2048);
        # the previous default of 16 was wrong and only worked because the
        # graph's declared shape overrode it below.
        num_heads = 32  # Default for Falcon-RW-1B
        head_dim = 64   # Default for Falcon-RW-1B

        # If we can determine the dims from the model, use those values instead.
        if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:
            shape = sample_past_key.shape
            if isinstance(shape[1], int):
                num_heads = shape[1]
            if isinstance(shape[3], int):
                head_dim = shape[3]
        print(f"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs")

        batch_size = 1
        previous_outputs = None  # present.* tensors from the previous step
        generated_ids = input_ids.copy()
        for i in range(max_new_tokens):
            current_length = generated_ids.shape[1]
            attention_mask = np.ones((1, current_length), dtype=np.int64)

            # After the first step only the newest token is fed; the KV cache
            # carries the context for everything before it.
            ort_inputs = {
                "input_ids": generated_ids[:, -1:] if i > 0 else generated_ids,
                "attention_mask": attention_mask,
            }
            if "position_ids" in input_names:
                position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)
                ort_inputs["position_ids"] = position_ids[:, -1:] if i > 0 else position_ids

            for layer_idx in range(24):  # Falcon has 24 layers
                if f"past_key_values.{layer_idx}.key" not in input_names:
                    continue
                if i == 0:
                    # First iteration: zero-length past (prompt only).
                    empty_key = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    empty_value = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    ort_inputs[f"past_key_values.{layer_idx}.key"] = empty_key
                    ort_inputs[f"past_key_values.{layer_idx}.value"] = empty_value
                else:
                    # Feed the cache produced by the previous step back in.
                    past_key_name = f"present.{layer_idx}.key"
                    past_value_name = f"present.{layer_idx}.value"
                    if past_key_name in output_names and past_value_name in output_names:
                        ort_inputs[f"past_key_values.{layer_idx}.key"] = previous_outputs[output_names.index(past_key_name)]
                        ort_inputs[f"past_key_values.{layer_idx}.value"] = previous_outputs[output_names.index(past_value_name)]

            outputs = session.run(output_names, ort_inputs)
            previous_outputs = outputs  # Save for next iteration

            logits_idx = next((idx for idx, name in enumerate(output_names) if 'logits' in name), 0)
            logits = outputs[logits_idx]

            # Greedy sampling: highest-probability token at the last position.
            next_token_id = np.argmax(logits[0, -1, :])
            generated_ids = np.concatenate(
                [generated_ids, np.array([[next_token_id]], dtype=generated_ids.dtype)], axis=1
            )

            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing Falcon model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_falcon_model_with_generation()
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_gpt2_model_with_generation(model_type="quantized", max_new_tokens=20):
    """Greedy token-by-token generation with the exported GPT2-Medium ONNX model."""
    print(f"\n===== Testing GPT2-Medium ({model_type}) with full generation =====")
    start_time = time.time()

    # Resolve the model file from the requested variant.
    use_quantized = model_type == "quantized"
    model_dir = "onnx_models/gpt2_onnx_quantized" if use_quantized else "onnx_models/gpt2_onnx"
    model_path = f"{model_dir}/model_quantized.onnx" if use_quantized else f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        prompt = "What is the capital of France?"
        encoded = tokenizer(prompt, return_tensors="np")
        input_ids = encoded["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        # Architecture defaults for GPT2-Medium, overridden by any concrete
        # dims the exported graph declares.
        num_heads = 16
        head_dim = 64
        if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:
            declared = sample_past_key.shape
            num_heads = declared[1] if isinstance(declared[1], int) else num_heads
            head_dim = declared[3] if isinstance(declared[3], int) else head_dim
        print(f"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs")

        generated_ids = input_ids.copy()
        for step in range(max_new_tokens):
            total_len = generated_ids.shape[1]
            # After the prefill step, only the newest token is fed; the cache
            # supplies the rest of the context.
            feed = {
                "input_ids": generated_ids if step == 0 else generated_ids[:, -1:],
                "attention_mask": np.ones((1, total_len), dtype=np.int64),
            }
            if "position_ids" in input_names:
                all_positions = np.arange(total_len, dtype=np.int64).reshape(1, -1)
                feed["position_ids"] = all_positions if step == 0 else all_positions[:, -1:]

            for layer_idx in range(24):  # GPT2-Medium has 24 layers
                key_in = f"past_key_values.{layer_idx}.key"
                if key_in not in input_names:
                    continue
                value_in = f"past_key_values.{layer_idx}.value"
                if step == 0:
                    # Zero-length cache on the first pass.
                    feed[key_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                    feed[value_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                else:
                    key_out = f"present.{layer_idx}.key"
                    value_out = f"present.{layer_idx}.value"
                    if key_out in output_names and value_out in output_names:
                        feed[key_in] = previous_outputs[output_names.index(key_out)]
                        feed[value_in] = previous_outputs[output_names.index(value_out)]

            outputs = session.run(output_names, feed)
            previous_outputs = outputs  # cache tensors for the next step

            # Locate the logits output, then greedily pick the next token.
            logits_pos = next((pos for pos, name in enumerate(output_names) if 'logits' in name), 0)
            next_token_id = np.argmax(outputs[logits_pos][0, -1, :])

            generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)
            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing GPT2 model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Test both quantized and non-quantized versions
    test_gpt2_model_with_generation("quantized")
    test_gpt2_model_with_generation("non-quantized")
onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import time\n", "\n", "def test_opt_model_with_generation(model_type=\"quantized\", max_new_tokens=20):\n", " print(f\"\\n===== Testing OPT-350M ({model_type}) with full generation =====\")\n", " start_time = time.time()\n", " \n", " # Set paths based on model type\n", " if model_type == \"quantized\":\n", " model_dir = \"onnx_models/opt_onnx_quantized\"\n", " model_path = f\"{model_dir}/model_quantized.onnx\"\n", " else:\n", " model_dir = \"onnx_models/opt_onnx\"\n", " model_path = f\"{model_dir}/model.onnx\"\n", " \n", " try:\n", " # Load tokenizer\n", " print(\"Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"facebook/opt-350m\")\n", " \n", " # Create ONNX Runtime session\n", " print(f\"Loading model from {model_path}...\")\n", " session = ort.InferenceSession(model_path)\n", " \n", " # Get input details to check the expected dimensions\n", " input_details = {inp.name: inp for inp in session.get_inputs()}\n", " \n", " # Print the first few input shapes to understand the expected dimensions\n", " sample_past_key = input_details.get(\"past_key_values.0.key\", None)\n", " if sample_past_key:\n", " print(f\"Expected shape for past_key_values.0.key: {sample_past_key.shape}\")\n", " \n", " # Test prompt \n", " prompt = \"What is the capital of France?\"\n", " \n", " # Tokenize the input\n", " inputs = tokenizer(prompt, return_tensors=\"np\")\n", " input_ids = inputs[\"input_ids\"]\n", " \n", " # Get model input and output names\n", " input_names = [inp.name for inp in session.get_inputs()]\n", " output_names = [outp.name for outp in session.get_outputs()]\n", " \n", " # Determine the correct dimensions for this model\n", " num_heads = 16 # Default for OPT-350M\n", " head_dim = 64 # Default for OPT-350M\n", " \n", " # If we can determine from the model, use those values instead\n", " if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 
4:\n", " shape = sample_past_key.shape\n", " if isinstance(shape[1], int):\n", " num_heads = shape[1]\n", " if isinstance(shape[3], int):\n", " head_dim = shape[3]\n", " print(f\"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs\")\n", " \n", " # Start generation loop\n", " generated_ids = input_ids.copy()\n", " for i in range(max_new_tokens):\n", " # Create inputs for this step\n", " current_length = generated_ids.shape[1]\n", " attention_mask = np.ones((1, current_length), dtype=np.int64)\n", " \n", " ort_inputs = {\n", " \"input_ids\": generated_ids[:, -1:] if i > 0 else generated_ids,\n", " \"attention_mask\": attention_mask,\n", " }\n", " \n", " if \"position_ids\" in input_names:\n", " position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)\n", " ort_inputs[\"position_ids\"] = position_ids[:, -1:] if i > 0 else position_ids\n", " \n", " # Create past KV caches (empty for first iteration)\n", " batch_size = 1\n", " seq_len = 0 if i == 0 else current_length - 1\n", " \n", " for layer_idx in range(24): # OPT-350M has 24 layers typically\n", " if f\"past_key_values.{layer_idx}.key\" in input_names:\n", " if i == 0: # First iteration - empty past\n", " empty_key = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " empty_value = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = empty_key\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = empty_value\n", " else: # Use past from previous iteration\n", " past_key_name = f\"present.{layer_idx}.key\"\n", " past_value_name = f\"present.{layer_idx}.value\"\n", " # Find the index of the past states in the outputs\n", " past_key_idx = output_names.index(past_key_name) if past_key_name in output_names else None\n", " past_value_idx = output_names.index(past_value_name) if past_value_name in output_names else None\n", " \n", " if past_key_idx is not None and past_value_idx is not 
None:\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = previous_outputs[past_key_idx]\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = previous_outputs[past_value_idx]\n", " \n", " # Run inference\n", " outputs = session.run(output_names, ort_inputs)\n", " previous_outputs = outputs # Save for next iteration\n", " \n", " # Find the logits output\n", " logits_idx = next((i for i, name in enumerate(output_names) if 'logits' in name), 0)\n", " logits = outputs[logits_idx]\n", " \n", " # Get next token with greedy sampling\n", " next_token_id = np.argmax(logits[0, -1, :])\n", " \n", " # Add token to generated_ids\n", " generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)\n", " \n", " # Check for end of sequence token\n", " if next_token_id == tokenizer.eos_token_id:\n", " break\n", " \n", " # Get the generated text\n", " generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " print(f\"Prompt: {prompt}\")\n", " print(f\"Response: {generated_text}\")\n", " print(f\"Generation time: {time.time() - start_time:.2f} seconds\")\n", " \n", " except Exception as e:\n", " print(f\"Error testing OPT model: {str(e)}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "if __name__ == \"__main__\":\n", " # Test both quantized and non-quantized versions\n", " test_opt_model_with_generation(\"quantized\")\n", " test_opt_model_with_generation(\"non-quantized\")" ] }, { "cell_type": "markdown", "id": "9343e46d", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "id": "9ff2be63", "metadata": {}, "source": [ "### Bloom-560M Model\n", "> Quantized and Non-Quantized" ] }, { "cell_type": "code", "execution_count": 7, "id": "3922e901", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "===== Testing Bloom-560M (quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from 
onnx_models/bloom_onnx_quantized/model_quantized.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 16, 'past_sequence_length', 64]\n", "Using num_heads=16, head_dim=64 based on model inputs\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France? [Answer is 'Paris' . [Answer is 'maillots de Saint Roux ' .\n", "Generation time: 10.20 seconds\n", "\n", "===== Testing Bloom-560M (non-quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/bloom_onnx/model.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 16, 'past_sequence_length', 64]\n", "Using num_heads=16, head_dim=64 based on model inputs\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France?\"\n", "\n", "\"It is Paris,\" said the Frenchman, with a smile.\n", "\n", "\"It is\n", "Generation time: 32.07 seconds\n" ] } ], "source": [ "import numpy as np\n", "import onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import time\n", "\n", "def test_bloom_model_with_generation(model_type=\"quantized\", max_new_tokens=20):\n", " print(f\"\\n===== Testing Bloom-560M ({model_type}) with full generation =====\")\n", " start_time = time.time()\n", " \n", " # Set paths based on model type\n", " if model_type == \"quantized\":\n", " model_dir = \"onnx_models/bloom_onnx_quantized\"\n", " model_path = f\"{model_dir}/model_quantized.onnx\"\n", " else:\n", " model_dir = \"onnx_models/bloom_onnx\"\n", " model_path = f\"{model_dir}/model.onnx\"\n", " \n", " try:\n", " # Load tokenizer\n", " print(\"Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"bigscience/bloom-560m\")\n", " \n", " # Create ONNX Runtime session\n", " print(f\"Loading model from {model_path}...\")\n", " session = ort.InferenceSession(model_path)\n", " \n", " # Get input details to check the expected dimensions\n", " input_details = {inp.name: inp for inp in 
session.get_inputs()}\n", " \n", " # Print the first few input shapes to understand the expected dimensions\n", " sample_past_key = input_details.get(\"past_key_values.0.key\", None)\n", " if sample_past_key:\n", " print(f\"Expected shape for past_key_values.0.key: {sample_past_key.shape}\")\n", " \n", " # Test prompt \n", " prompt = \"What is the capital of France?\"\n", " \n", " # Tokenize the input\n", " inputs = tokenizer(prompt, return_tensors=\"np\")\n", " input_ids = inputs[\"input_ids\"]\n", " \n", " # Get model input and output names\n", " input_names = [inp.name for inp in session.get_inputs()]\n", " output_names = [outp.name for outp in session.get_outputs()]\n", " \n", " # Determine the correct dimensions for this model\n", " num_heads = 16 # Default for Bloom-560M\n", " head_dim = 64 # Default for Bloom-560M\n", " \n", " # If we can determine from the model, use those values instead\n", " if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:\n", " shape = sample_past_key.shape\n", " if isinstance(shape[1], int):\n", " num_heads = shape[1]\n", " if isinstance(shape[3], int):\n", " head_dim = shape[3]\n", " print(f\"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs\")\n", " \n", " # Start generation loop\n", " generated_ids = input_ids.copy()\n", " for i in range(max_new_tokens):\n", " # Create inputs for this step\n", " current_length = generated_ids.shape[1]\n", " attention_mask = np.ones((1, current_length), dtype=np.int64)\n", " \n", " ort_inputs = {\n", " \"input_ids\": generated_ids[:, -1:] if i > 0 else generated_ids,\n", " \"attention_mask\": attention_mask,\n", " }\n", " \n", " if \"position_ids\" in input_names:\n", " position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)\n", " ort_inputs[\"position_ids\"] = position_ids[:, -1:] if i > 0 else position_ids\n", " \n", " # Create past KV caches (empty for first iteration)\n", " batch_size = 1\n", " seq_len = 0 if i == 0 
else current_length - 1\n", " \n", " for layer_idx in range(24): # Bloom-560M typically has 24 layers\n", " if f\"past_key_values.{layer_idx}.key\" in input_names:\n", " if i == 0: # First iteration - empty past\n", " empty_key = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " empty_value = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = empty_key\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = empty_value\n", " else: # Use past from previous iteration\n", " past_key_name = f\"present.{layer_idx}.key\"\n", " past_value_name = f\"present.{layer_idx}.value\"\n", " # Find the index of the past states in the outputs\n", " past_key_idx = output_names.index(past_key_name) if past_key_name in output_names else None\n", " past_value_idx = output_names.index(past_value_name) if past_value_name in output_names else None\n", " \n", " if past_key_idx is not None and past_value_idx is not None:\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = previous_outputs[past_key_idx]\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = previous_outputs[past_value_idx]\n", " \n", " # Run inference\n", " outputs = session.run(output_names, ort_inputs)\n", " previous_outputs = outputs # Save for next iteration\n", " \n", " # Find the logits output\n", " logits_idx = next((i for i, name in enumerate(output_names) if 'logits' in name), 0)\n", " logits = outputs[logits_idx]\n", " \n", " # Get next token with greedy sampling\n", " next_token_id = np.argmax(logits[0, -1, :])\n", " \n", " # Add token to generated_ids\n", " generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)\n", " \n", " # Check for end of sequence token\n", " if next_token_id == tokenizer.eos_token_id:\n", " break\n", " \n", " # Get the generated text\n", " generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " print(f\"Prompt: 
{prompt}\")\n", " print(f\"Response: {generated_text}\")\n", " print(f\"Generation time: {time.time() - start_time:.2f} seconds\")\n", " \n", " except Exception as e:\n", " print(f\"Error testing Bloom model: {str(e)}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "if __name__ == \"__main__\":\n", " # Test both quantized and non-quantized versions\n", " test_bloom_model_with_generation(\"quantized\")\n", " test_bloom_model_with_generation(\"non-quantized\")" ] }, { "cell_type": "markdown", "id": "ab4bee77", "metadata": {}, "source": [ "---" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }