{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "### Qwen-0.5B Model\n", "> Quantized and Non-Quantized" ] }, { "cell_type": "code", "execution_count": 1, "id": "3ec71d43", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/administrator/miniconda/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "===== Testing Qwen-0.5B (quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/qwen_onnx_quantized/model_quantized.onnx...\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France?\n", "The first step in the process of creating a new business is to identify the market. This is\n", "Generation time: 9.88 seconds\n", "\n", "===== Testing Qwen-0.5B (non-quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/qwen_onnx/model.onnx...\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France? 
Paris.\n", "Generation time: 19.48 seconds\n" ] } ], "source": [ "import numpy as np\n", "import onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import time\n", "\n", "def test_qwen_model_with_generation(model_type=\"quantized\", max_new_tokens=20):\n", " print(f\"\\n===== Testing Qwen-0.5B ({model_type}) with full generation =====\")\n", " start_time = time.time()\n", " \n", " # Set paths based on model type\n", " if model_type == \"quantized\":\n", " model_dir = \"onnx_models/qwen_onnx_quantized\"\n", " model_path = f\"{model_dir}/model_quantized.onnx\"\n", " else:\n", " model_dir = \"onnx_models/qwen_onnx\"\n", " model_path = f\"{model_dir}/model.onnx\"\n", " \n", " try:\n", " # Load tokenizer\n", " print(\"Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"Qwen/Qwen1.5-0.5B\")\n", " \n", " # Create ONNX Runtime session\n", " print(f\"Loading model from {model_path}...\")\n", " session = ort.InferenceSession(model_path)\n", " \n", " # Test prompt\n", " prompt = \"What is the capital of France?\"\n", " \n", " # Tokenize the input\n", " inputs = tokenizer(prompt, return_tensors=\"np\")\n", " input_ids = inputs[\"input_ids\"]\n", " \n", " # Get model input and output names\n", " input_names = [inp.name for inp in session.get_inputs()]\n", " output_names = [outp.name for outp in session.get_outputs()]\n", " \n", " # Start generation loop\n", " generated_ids = input_ids.copy()\n", " for i in range(max_new_tokens):\n", " # Create inputs for this step\n", " current_length = generated_ids.shape[1]\n", " attention_mask = np.ones((1, current_length), dtype=np.int64)\n", " position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)\n", " \n", " # Create past KV caches (empty for first iteration)\n", " batch_size = 1\n", " num_heads = 16 # For Qwen-0.5B\n", " head_dim = 64 # For Qwen-0.5B\n", " seq_len = 0 if i == 0 else current_length - 1\n", " \n", " ort_inputs = {\n", " \"input_ids\": generated_ids[:, -1:] if i > 0 
else generated_ids,\n", " \"attention_mask\": attention_mask,\n", " }\n", " \n", " if \"position_ids\" in input_names:\n", " ort_inputs[\"position_ids\"] = position_ids[:, -1:] if i > 0 else position_ids\n", " \n", " for layer_idx in range(24): # Qwen-0.5B has 24 layers\n", " if f\"past_key_values.{layer_idx}.key\" in input_names:\n", " if i == 0: # First iteration - empty past\n", " empty_key = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " empty_value = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = empty_key\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = empty_value\n", " else: # Use past from previous iteration\n", " past_key_name = f\"present.{layer_idx}.key\"\n", " past_value_name = f\"present.{layer_idx}.value\"\n", " # Find the index of the past states in the outputs\n", " past_key_idx = output_names.index(past_key_name) if past_key_name in output_names else None\n", " past_value_idx = output_names.index(past_value_name) if past_value_name in output_names else None\n", " \n", " if past_key_idx is not None and past_value_idx is not None:\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = previous_outputs[past_key_idx]\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = previous_outputs[past_value_idx]\n", " \n", " # Run inference\n", " outputs = session.run(output_names, ort_inputs)\n", " previous_outputs = outputs # Save for next iteration\n", " \n", " # Find the logits output\n", " logits_idx = next((i for i, name in enumerate(output_names) if 'logits' in name), 0)\n", " logits = outputs[logits_idx]\n", " \n", " # Get next token with greedy sampling\n", " next_token_id = np.argmax(logits[0, -1, :])\n", " \n", " # Add token to generated_ids\n", " generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)\n", " \n", " # Check for end of sequence token\n", " if next_token_id == 
tokenizer.eos_token_id:\n", " break\n", " \n", " # Get the generated text\n", " generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " print(f\"Prompt: {prompt}\")\n", " print(f\"Response: {generated_text}\")\n", " print(f\"Generation time: {time.time() - start_time:.2f} seconds\")\n", " \n", " except Exception as e:\n", " print(f\"Error testing Qwen model: {str(e)}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "if __name__ == \"__main__\":\n", " # Test both quantized and non-quantized versions\n", " test_qwen_model_with_generation(\"quantized\")\n", " test_qwen_model_with_generation(\"non-quantized\")" ] }, { "cell_type": "markdown", "id": "e2cd5ad3", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "id": "0100fd67", "metadata": {}, "source": [ "### TinyLlama-1.1B\n", "> Quantized and Non-Quantized" ] }, { "cell_type": "code", "execution_count": 2, "id": "09af0532", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "===== Testing TinyLlama-1.1B (quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/tinyllama_onnx_quantized/model_quantized.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 4, 'past_sequence_length', 64]\n", "Prompt: <|system|>\n", "You are a helpful assistant.\n", "<|user|>\n", "What is the capital of France?<|assistant|>\n", "Response: <|system|>\n", "You are a helpful assistant.\n", "<|user|>\n", "What is the capital of France?<|assistant|>\n", "France is the capital of France.\n", "Generation time: 11.95 seconds\n", "\n", "===== Testing TinyLlama-1.1B (non-quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/tinyllama_onnx/model.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 4, 'past_sequence_length', 64]\n", "Prompt: <|system|>\n", "You are a helpful assistant.\n", "<|user|>\n", "What is the capital of 
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_tinyllama_model_with_generation(model_type="quantized", max_new_tokens=20):
    """Greedy, token-by-token generation with the exported TinyLlama-1.1B ONNX model.

    Args:
        model_type: "quantized" selects onnx_models/tinyllama_onnx_quantized/model_quantized.onnx;
            anything else selects onnx_models/tinyllama_onnx/model.onnx.
        max_new_tokens: upper bound on generated tokens; stops early on EOS.
    """
    print(f"\n===== Testing TinyLlama-1.1B ({model_type}) with full generation =====")
    start_time = time.time()

    # Set paths based on model type
    if model_type == "quantized":
        model_dir = "onnx_models/tinyllama_onnx_quantized"
        model_path = f"{model_dir}/model_quantized.onnx"
    else:
        model_dir = "onnx_models/tinyllama_onnx"
        model_path = f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        # TinyLlama-1.1B uses grouped-query attention: 4 KV heads x 64 dims.
        # Prefer the dims declared by the exported graph (as the other model
        # cells in this notebook do) and fall back to the known values.
        num_heads = 4
        head_dim = 64
        if sample_past_key is not None and len(sample_past_key.shape) == 4:
            if isinstance(sample_past_key.shape[1], int):
                num_heads = sample_past_key.shape[1]
            if isinstance(sample_past_key.shape[3], int):
                head_dim = sample_past_key.shape[3]

        # Test prompt - format properly for chat model
        prompt = "<|system|>\nYou are a helpful assistant.\n<|user|>\nWhat is the capital of France?<|assistant|>"
        inputs = tokenizer(prompt, return_tensors="np")
        input_ids = inputs["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        batch_size = 1
        previous_outputs = None  # present.* tensors from the previous step
        generated_ids = input_ids.copy()
        for i in range(max_new_tokens):
            current_length = generated_ids.shape[1]
            attention_mask = np.ones((1, current_length), dtype=np.int64)

            # After the first step only the newest token is fed; the KV cache
            # carries the context for everything before it.
            ort_inputs = {
                "input_ids": generated_ids[:, -1:] if i > 0 else generated_ids,
                "attention_mask": attention_mask,
            }
            if "position_ids" in input_names:
                position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)
                ort_inputs["position_ids"] = position_ids[:, -1:] if i > 0 else position_ids

            for layer_idx in range(22):  # TinyLlama-1.1B has 22 layers
                if f"past_key_values.{layer_idx}.key" not in input_names:
                    continue
                if i == 0:
                    # First iteration: zero-length past (prompt only).
                    empty_key = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    empty_value = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    ort_inputs[f"past_key_values.{layer_idx}.key"] = empty_key
                    ort_inputs[f"past_key_values.{layer_idx}.value"] = empty_value
                else:
                    # Feed the cache produced by the previous step back in.
                    past_key_name = f"present.{layer_idx}.key"
                    past_value_name = f"present.{layer_idx}.value"
                    if past_key_name in output_names and past_value_name in output_names:
                        ort_inputs[f"past_key_values.{layer_idx}.key"] = previous_outputs[output_names.index(past_key_name)]
                        ort_inputs[f"past_key_values.{layer_idx}.value"] = previous_outputs[output_names.index(past_value_name)]

            outputs = session.run(output_names, ort_inputs)
            previous_outputs = outputs  # Save for next iteration

            logits_idx = next((idx for idx, name in enumerate(output_names) if 'logits' in name), 0)
            logits = outputs[logits_idx]

            # Greedy sampling: highest-probability token at the last position.
            next_token_id = np.argmax(logits[0, -1, :])
            generated_ids = np.concatenate(
                [generated_ids, np.array([[next_token_id]], dtype=generated_ids.dtype)], axis=1
            )

            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing TinyLlama model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Test both quantized and non-quantized versions
    test_tinyllama_model_with_generation("quantized")
    test_tinyllama_model_with_generation("non-quantized")
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_phi_model_with_generation(max_new_tokens=20):
    """Greedy token-by-token generation with the exported Phi-1.5 ONNX model."""
    print("\n===== Testing Phi-1.5 with full generation =====")
    start_time = time.time()

    model_dir = "onnx_models/phi_onnx"
    model_path = f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        prompt = "What is the capital of France?"
        encoded = tokenizer(prompt, return_tensors="np")
        input_ids = encoded["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        # Architecture defaults for Phi-1.5, overridden by any concrete dims
        # the exported graph declares.
        num_heads = 32
        head_dim = 80
        if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:
            declared = sample_past_key.shape
            num_heads = declared[1] if isinstance(declared[1], int) else num_heads
            head_dim = declared[3] if isinstance(declared[3], int) else head_dim
        print(f"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs")

        generated_ids = input_ids.copy()
        for step in range(max_new_tokens):
            total_len = generated_ids.shape[1]
            # After the prefill step, only the newest token is fed; the cache
            # supplies the rest of the context.
            feed = {
                "input_ids": generated_ids if step == 0 else generated_ids[:, -1:],
                "attention_mask": np.ones((1, total_len), dtype=np.int64),
            }
            if "position_ids" in input_names:
                all_positions = np.arange(total_len, dtype=np.int64).reshape(1, -1)
                feed["position_ids"] = all_positions if step == 0 else all_positions[:, -1:]

            for layer_idx in range(24):  # Phi-1.5 has 24 layers
                key_in = f"past_key_values.{layer_idx}.key"
                if key_in not in input_names:
                    continue
                value_in = f"past_key_values.{layer_idx}.value"
                if step == 0:
                    # Zero-length cache on the first pass.
                    feed[key_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                    feed[value_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                else:
                    key_out = f"present.{layer_idx}.key"
                    value_out = f"present.{layer_idx}.value"
                    if key_out in output_names and value_out in output_names:
                        feed[key_in] = previous_outputs[output_names.index(key_out)]
                        feed[value_in] = previous_outputs[output_names.index(value_out)]

            outputs = session.run(output_names, feed)
            previous_outputs = outputs  # cache tensors for the next step

            # Locate the logits output, then greedily pick the next token.
            logits_pos = next((pos for pos, name in enumerate(output_names) if 'logits' in name), 0)
            next_token_id = np.argmax(outputs[logits_pos][0, -1, :])

            generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)
            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing Phi model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_phi_model_with_generation()
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_falcon_model_with_generation(max_new_tokens=20):
    """Greedy, token-by-token generation with the exported Falcon-RW-1B ONNX model.

    Args:
        max_new_tokens: upper bound on generated tokens; stops early on EOS.
    """
    print("\n===== Testing Falcon-RW-1B with full generation =====")
    start_time = time.time()

    model_dir = "onnx_models/falcon_onnx"
    model_path = f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-rw-1b")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        prompt = "What is the capital of France?"
        inputs = tokenizer(prompt, return_tensors="np")
        input_ids = inputs["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        # Falcon-RW-1B has 32 attention heads of dim 64 (hidden size 2048);
        # the previous default of 16 was wrong and only worked because the
        # graph's declared shape overrode it below.
        num_heads = 32  # Default for Falcon-RW-1B
        head_dim = 64   # Default for Falcon-RW-1B

        # If we can determine the dims from the model, use those values instead.
        if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:
            shape = sample_past_key.shape
            if isinstance(shape[1], int):
                num_heads = shape[1]
            if isinstance(shape[3], int):
                head_dim = shape[3]
        print(f"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs")

        batch_size = 1
        previous_outputs = None  # present.* tensors from the previous step
        generated_ids = input_ids.copy()
        for i in range(max_new_tokens):
            current_length = generated_ids.shape[1]
            attention_mask = np.ones((1, current_length), dtype=np.int64)

            # After the first step only the newest token is fed; the KV cache
            # carries the context for everything before it.
            ort_inputs = {
                "input_ids": generated_ids[:, -1:] if i > 0 else generated_ids,
                "attention_mask": attention_mask,
            }
            if "position_ids" in input_names:
                position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)
                ort_inputs["position_ids"] = position_ids[:, -1:] if i > 0 else position_ids

            for layer_idx in range(24):  # Falcon has 24 layers
                if f"past_key_values.{layer_idx}.key" not in input_names:
                    continue
                if i == 0:
                    # First iteration: zero-length past (prompt only).
                    empty_key = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    empty_value = np.zeros((batch_size, num_heads, 0, head_dim), dtype=np.float32)
                    ort_inputs[f"past_key_values.{layer_idx}.key"] = empty_key
                    ort_inputs[f"past_key_values.{layer_idx}.value"] = empty_value
                else:
                    # Feed the cache produced by the previous step back in.
                    past_key_name = f"present.{layer_idx}.key"
                    past_value_name = f"present.{layer_idx}.value"
                    if past_key_name in output_names and past_value_name in output_names:
                        ort_inputs[f"past_key_values.{layer_idx}.key"] = previous_outputs[output_names.index(past_key_name)]
                        ort_inputs[f"past_key_values.{layer_idx}.value"] = previous_outputs[output_names.index(past_value_name)]

            outputs = session.run(output_names, ort_inputs)
            previous_outputs = outputs  # Save for next iteration

            logits_idx = next((idx for idx, name in enumerate(output_names) if 'logits' in name), 0)
            logits = outputs[logits_idx]

            # Greedy sampling: highest-probability token at the last position.
            next_token_id = np.argmax(logits[0, -1, :])
            generated_ids = np.concatenate(
                [generated_ids, np.array([[next_token_id]], dtype=generated_ids.dtype)], axis=1
            )

            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing Falcon model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    test_falcon_model_with_generation()
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
import time

def test_gpt2_model_with_generation(model_type="quantized", max_new_tokens=20):
    """Greedy token-by-token generation with the exported GPT2-Medium ONNX model."""
    print(f"\n===== Testing GPT2-Medium ({model_type}) with full generation =====")
    start_time = time.time()

    # Resolve the model file from the requested variant.
    use_quantized = model_type == "quantized"
    model_dir = "onnx_models/gpt2_onnx_quantized" if use_quantized else "onnx_models/gpt2_onnx"
    model_path = f"{model_dir}/model_quantized.onnx" if use_quantized else f"{model_dir}/model.onnx"

    try:
        print("Loading tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

        print(f"Loading model from {model_path}...")
        session = ort.InferenceSession(model_path)

        # Inspect the graph's declared KV-cache shape.
        input_details = {inp.name: inp for inp in session.get_inputs()}
        sample_past_key = input_details.get("past_key_values.0.key", None)
        if sample_past_key:
            print(f"Expected shape for past_key_values.0.key: {sample_past_key.shape}")

        prompt = "What is the capital of France?"
        encoded = tokenizer(prompt, return_tensors="np")
        input_ids = encoded["input_ids"]

        input_names = [inp.name for inp in session.get_inputs()]
        output_names = [outp.name for outp in session.get_outputs()]

        # Architecture defaults for GPT2-Medium, overridden by any concrete
        # dims the exported graph declares.
        num_heads = 16
        head_dim = 64
        if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:
            declared = sample_past_key.shape
            num_heads = declared[1] if isinstance(declared[1], int) else num_heads
            head_dim = declared[3] if isinstance(declared[3], int) else head_dim
        print(f"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs")

        generated_ids = input_ids.copy()
        for step in range(max_new_tokens):
            total_len = generated_ids.shape[1]
            # After the prefill step, only the newest token is fed; the cache
            # supplies the rest of the context.
            feed = {
                "input_ids": generated_ids if step == 0 else generated_ids[:, -1:],
                "attention_mask": np.ones((1, total_len), dtype=np.int64),
            }
            if "position_ids" in input_names:
                all_positions = np.arange(total_len, dtype=np.int64).reshape(1, -1)
                feed["position_ids"] = all_positions if step == 0 else all_positions[:, -1:]

            for layer_idx in range(24):  # GPT2-Medium has 24 layers
                key_in = f"past_key_values.{layer_idx}.key"
                if key_in not in input_names:
                    continue
                value_in = f"past_key_values.{layer_idx}.value"
                if step == 0:
                    # Zero-length cache on the first pass.
                    feed[key_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                    feed[value_in] = np.zeros((1, num_heads, 0, head_dim), dtype=np.float32)
                else:
                    key_out = f"present.{layer_idx}.key"
                    value_out = f"present.{layer_idx}.value"
                    if key_out in output_names and value_out in output_names:
                        feed[key_in] = previous_outputs[output_names.index(key_out)]
                        feed[value_in] = previous_outputs[output_names.index(value_out)]

            outputs = session.run(output_names, feed)
            previous_outputs = outputs  # cache tensors for the next step

            # Locate the logits output, then greedily pick the next token.
            logits_pos = next((pos for pos, name in enumerate(output_names) if 'logits' in name), 0)
            next_token_id = np.argmax(outputs[logits_pos][0, -1, :])

            generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)
            if next_token_id == tokenizer.eos_token_id:
                break

        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

        print(f"Prompt: {prompt}")
        print(f"Response: {generated_text}")
        print(f"Generation time: {time.time() - start_time:.2f} seconds")

    except Exception as e:
        print(f"Error testing GPT2 model: {str(e)}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    # Test both quantized and non-quantized versions
    test_gpt2_model_with_generation("quantized")
    test_gpt2_model_with_generation("non-quantized")
onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import time\n", "\n", "def test_opt_model_with_generation(model_type=\"quantized\", max_new_tokens=20):\n", " print(f\"\\n===== Testing OPT-350M ({model_type}) with full generation =====\")\n", " start_time = time.time()\n", " \n", " # Set paths based on model type\n", " if model_type == \"quantized\":\n", " model_dir = \"onnx_models/opt_onnx_quantized\"\n", " model_path = f\"{model_dir}/model_quantized.onnx\"\n", " else:\n", " model_dir = \"onnx_models/opt_onnx\"\n", " model_path = f\"{model_dir}/model.onnx\"\n", " \n", " try:\n", " # Load tokenizer\n", " print(\"Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"facebook/opt-350m\")\n", " \n", " # Create ONNX Runtime session\n", " print(f\"Loading model from {model_path}...\")\n", " session = ort.InferenceSession(model_path)\n", " \n", " # Get input details to check the expected dimensions\n", " input_details = {inp.name: inp for inp in session.get_inputs()}\n", " \n", " # Print the first few input shapes to understand the expected dimensions\n", " sample_past_key = input_details.get(\"past_key_values.0.key\", None)\n", " if sample_past_key:\n", " print(f\"Expected shape for past_key_values.0.key: {sample_past_key.shape}\")\n", " \n", " # Test prompt \n", " prompt = \"What is the capital of France?\"\n", " \n", " # Tokenize the input\n", " inputs = tokenizer(prompt, return_tensors=\"np\")\n", " input_ids = inputs[\"input_ids\"]\n", " \n", " # Get model input and output names\n", " input_names = [inp.name for inp in session.get_inputs()]\n", " output_names = [outp.name for outp in session.get_outputs()]\n", " \n", " # Determine the correct dimensions for this model\n", " num_heads = 16 # Default for OPT-350M\n", " head_dim = 64 # Default for OPT-350M\n", " \n", " # If we can determine from the model, use those values instead\n", " if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 
4:\n", " shape = sample_past_key.shape\n", " if isinstance(shape[1], int):\n", " num_heads = shape[1]\n", " if isinstance(shape[3], int):\n", " head_dim = shape[3]\n", " print(f\"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs\")\n", " \n", " # Start generation loop\n", " generated_ids = input_ids.copy()\n", " for i in range(max_new_tokens):\n", " # Create inputs for this step\n", " current_length = generated_ids.shape[1]\n", " attention_mask = np.ones((1, current_length), dtype=np.int64)\n", " \n", " ort_inputs = {\n", " \"input_ids\": generated_ids[:, -1:] if i > 0 else generated_ids,\n", " \"attention_mask\": attention_mask,\n", " }\n", " \n", " if \"position_ids\" in input_names:\n", " position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)\n", " ort_inputs[\"position_ids\"] = position_ids[:, -1:] if i > 0 else position_ids\n", " \n", " # Create past KV caches (empty for first iteration)\n", " batch_size = 1\n", " seq_len = 0 if i == 0 else current_length - 1\n", " \n", " for layer_idx in range(24): # OPT-350M has 24 layers typically\n", " if f\"past_key_values.{layer_idx}.key\" in input_names:\n", " if i == 0: # First iteration - empty past\n", " empty_key = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " empty_value = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = empty_key\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = empty_value\n", " else: # Use past from previous iteration\n", " past_key_name = f\"present.{layer_idx}.key\"\n", " past_value_name = f\"present.{layer_idx}.value\"\n", " # Find the index of the past states in the outputs\n", " past_key_idx = output_names.index(past_key_name) if past_key_name in output_names else None\n", " past_value_idx = output_names.index(past_value_name) if past_value_name in output_names else None\n", " \n", " if past_key_idx is not None and past_value_idx is not 
None:\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = previous_outputs[past_key_idx]\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = previous_outputs[past_value_idx]\n", " \n", " # Run inference\n", " outputs = session.run(output_names, ort_inputs)\n", " previous_outputs = outputs # Save for next iteration\n", " \n", " # Find the logits output\n", " logits_idx = next((i for i, name in enumerate(output_names) if 'logits' in name), 0)\n", " logits = outputs[logits_idx]\n", " \n", " # Get next token with greedy sampling\n", " next_token_id = np.argmax(logits[0, -1, :])\n", " \n", " # Add token to generated_ids\n", " generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)\n", " \n", " # Check for end of sequence token\n", " if next_token_id == tokenizer.eos_token_id:\n", " break\n", " \n", " # Get the generated text\n", " generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " print(f\"Prompt: {prompt}\")\n", " print(f\"Response: {generated_text}\")\n", " print(f\"Generation time: {time.time() - start_time:.2f} seconds\")\n", " \n", " except Exception as e:\n", " print(f\"Error testing OPT model: {str(e)}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "if __name__ == \"__main__\":\n", " # Test both quantized and non-quantized versions\n", " test_opt_model_with_generation(\"quantized\")\n", " test_opt_model_with_generation(\"non-quantized\")" ] }, { "cell_type": "markdown", "id": "9343e46d", "metadata": {}, "source": [ "----" ] }, { "cell_type": "markdown", "id": "9ff2be63", "metadata": {}, "source": [ "### Bloom-560M Model\n", "> Quantized and Non-Quantized" ] }, { "cell_type": "code", "execution_count": 7, "id": "3922e901", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "===== Testing Bloom-560M (quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from 
onnx_models/bloom_onnx_quantized/model_quantized.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 16, 'past_sequence_length', 64]\n", "Using num_heads=16, head_dim=64 based on model inputs\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France? [Answer is 'Paris' . [Answer is 'maillots de Saint Roux ' .\n", "Generation time: 10.20 seconds\n", "\n", "===== Testing Bloom-560M (non-quantized) with full generation =====\n", "Loading tokenizer...\n", "Loading model from onnx_models/bloom_onnx/model.onnx...\n", "Expected shape for past_key_values.0.key: ['batch_size', 16, 'past_sequence_length', 64]\n", "Using num_heads=16, head_dim=64 based on model inputs\n", "Prompt: What is the capital of France?\n", "Response: What is the capital of France?\"\n", "\n", "\"It is Paris,\" said the Frenchman, with a smile.\n", "\n", "\"It is\n", "Generation time: 32.07 seconds\n" ] } ], "source": [ "import numpy as np\n", "import onnxruntime as ort\n", "from transformers import AutoTokenizer\n", "import time\n", "\n", "def test_bloom_model_with_generation(model_type=\"quantized\", max_new_tokens=20):\n", " print(f\"\\n===== Testing Bloom-560M ({model_type}) with full generation =====\")\n", " start_time = time.time()\n", " \n", " # Set paths based on model type\n", " if model_type == \"quantized\":\n", " model_dir = \"onnx_models/bloom_onnx_quantized\"\n", " model_path = f\"{model_dir}/model_quantized.onnx\"\n", " else:\n", " model_dir = \"onnx_models/bloom_onnx\"\n", " model_path = f\"{model_dir}/model.onnx\"\n", " \n", " try:\n", " # Load tokenizer\n", " print(\"Loading tokenizer...\")\n", " tokenizer = AutoTokenizer.from_pretrained(\"bigscience/bloom-560m\")\n", " \n", " # Create ONNX Runtime session\n", " print(f\"Loading model from {model_path}...\")\n", " session = ort.InferenceSession(model_path)\n", " \n", " # Get input details to check the expected dimensions\n", " input_details = {inp.name: inp for inp in 
session.get_inputs()}\n", " \n", " # Print the first few input shapes to understand the expected dimensions\n", " sample_past_key = input_details.get(\"past_key_values.0.key\", None)\n", " if sample_past_key:\n", " print(f\"Expected shape for past_key_values.0.key: {sample_past_key.shape}\")\n", " \n", " # Test prompt \n", " prompt = \"What is the capital of France?\"\n", " \n", " # Tokenize the input\n", " inputs = tokenizer(prompt, return_tensors=\"np\")\n", " input_ids = inputs[\"input_ids\"]\n", " \n", " # Get model input and output names\n", " input_names = [inp.name for inp in session.get_inputs()]\n", " output_names = [outp.name for outp in session.get_outputs()]\n", " \n", " # Determine the correct dimensions for this model\n", " num_heads = 16 # Default for Bloom-560M\n", " head_dim = 64 # Default for Bloom-560M\n", " \n", " # If we can determine from the model, use those values instead\n", " if sample_past_key and hasattr(sample_past_key, 'shape') and len(sample_past_key.shape) == 4:\n", " shape = sample_past_key.shape\n", " if isinstance(shape[1], int):\n", " num_heads = shape[1]\n", " if isinstance(shape[3], int):\n", " head_dim = shape[3]\n", " print(f\"Using num_heads={num_heads}, head_dim={head_dim} based on model inputs\")\n", " \n", " # Start generation loop\n", " generated_ids = input_ids.copy()\n", " for i in range(max_new_tokens):\n", " # Create inputs for this step\n", " current_length = generated_ids.shape[1]\n", " attention_mask = np.ones((1, current_length), dtype=np.int64)\n", " \n", " ort_inputs = {\n", " \"input_ids\": generated_ids[:, -1:] if i > 0 else generated_ids,\n", " \"attention_mask\": attention_mask,\n", " }\n", " \n", " if \"position_ids\" in input_names:\n", " position_ids = np.arange(current_length, dtype=np.int64).reshape(1, -1)\n", " ort_inputs[\"position_ids\"] = position_ids[:, -1:] if i > 0 else position_ids\n", " \n", " # Create past KV caches (empty for first iteration)\n", " batch_size = 1\n", " seq_len = 0 if i == 0 
else current_length - 1\n", " \n", " for layer_idx in range(24): # Bloom-560M typically has 24 layers\n", " if f\"past_key_values.{layer_idx}.key\" in input_names:\n", " if i == 0: # First iteration - empty past\n", " empty_key = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " empty_value = np.zeros((batch_size, num_heads, seq_len, head_dim), dtype=np.float32)\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = empty_key\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = empty_value\n", " else: # Use past from previous iteration\n", " past_key_name = f\"present.{layer_idx}.key\"\n", " past_value_name = f\"present.{layer_idx}.value\"\n", " # Find the index of the past states in the outputs\n", " past_key_idx = output_names.index(past_key_name) if past_key_name in output_names else None\n", " past_value_idx = output_names.index(past_value_name) if past_value_name in output_names else None\n", " \n", " if past_key_idx is not None and past_value_idx is not None:\n", " ort_inputs[f\"past_key_values.{layer_idx}.key\"] = previous_outputs[past_key_idx]\n", " ort_inputs[f\"past_key_values.{layer_idx}.value\"] = previous_outputs[past_value_idx]\n", " \n", " # Run inference\n", " outputs = session.run(output_names, ort_inputs)\n", " previous_outputs = outputs # Save for next iteration\n", " \n", " # Find the logits output\n", " logits_idx = next((i for i, name in enumerate(output_names) if 'logits' in name), 0)\n", " logits = outputs[logits_idx]\n", " \n", " # Get next token with greedy sampling\n", " next_token_id = np.argmax(logits[0, -1, :])\n", " \n", " # Add token to generated_ids\n", " generated_ids = np.concatenate([generated_ids, np.array([[next_token_id]])], axis=1)\n", " \n", " # Check for end of sequence token\n", " if next_token_id == tokenizer.eos_token_id:\n", " break\n", " \n", " # Get the generated text\n", " generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)\n", " \n", " print(f\"Prompt: 
{prompt}\")\n", " print(f\"Response: {generated_text}\")\n", " print(f\"Generation time: {time.time() - start_time:.2f} seconds\")\n", " \n", " except Exception as e:\n", " print(f\"Error testing Bloom model: {str(e)}\")\n", " import traceback\n", " traceback.print_exc()\n", "\n", "if __name__ == \"__main__\":\n", " # Test both quantized and non-quantized versions\n", " test_bloom_model_with_generation(\"quantized\")\n", " test_bloom_model_with_generation(\"non-quantized\")" ] }, { "cell_type": "markdown", "id": "ab4bee77", "metadata": {}, "source": [ "---" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 5 }