litert-community
/

DeepSeek-R1-Distill-Qwen-1.5B

@@ -26,13 +26,27 @@
     {
       "cell_type": "code",
       "source": [
-        "!pip install ai-edge-litert-nightly"
       ],
       "metadata": {
-        "id": "43tAeO0AZ7zp"
       },
-      "execution_count": null,
-      "outputs": []
     },
     {
       "cell_type": "code",
@@ -46,7 +60,7 @@
       "metadata": {
         "id": "i6PMkMVBPr1p"
       },
-      "execution_count": null,
       "outputs": []
     },
     {
@@ -68,7 +82,7 @@
       "metadata": {
         "id": "3t47HAG2tvc3"
       },
-      "execution_count": null,
       "outputs": []
     },
     {
@@ -93,7 +107,7 @@
       "metadata": {
         "id": "Rvdn3EIZhaQn"
       },
-      "execution_count": null,
       "outputs": []
     },
     {
@@ -108,6 +122,7 @@
     {
       "cell_type": "code",
       "source": [
         "\n",
         "class LiteRTLlmPipeline:\n",
         "\n",
@@ -133,7 +148,11 @@
         "    Args:\n",
         "      num_input_tokens: The number of input tokens.\n",
         "    \"\"\"\n",
         "\n",
         "    self._prefill_runner = self._get_prefill_runner(num_input_tokens)\n",
         "    # input_token_shape has shape (batch, max_seq_len)\n",
         "    input_token_shape = self._prefill_runner.get_input_details()[\"tokens\"][\n",
@@ -203,62 +222,127 @@
         "      )\n",
         "    return self._interpreter.get_signature_runner(best_signature)\n",
         "\n",
-        "  def _greedy_sampler(self, logits: np.ndarray) -> int:\n",
-        "    return int(np.argmax(logits))\n",
         "\n",
-        "  def generate(self, prompt: str, max_decode_steps: int | None = None) -> str:\n",
-        "    messages=[{ 'role': 'user', 'content': prompt}]\n",
-        "    token_ids = self._tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)\n",
-        "    # Initialize the prefill runner with the suitable input size.\n",
-        "    self._init_prefill_runner(len(token_ids))\n",
         "\n",
-        "    actual_max_decode_steps = self._max_kv_cache_seq_len - len(token_ids)\n",
-        "    if max_decode_steps is not None:\n",
-        "      actual_max_decode_steps = min(actual_max_decode_steps, max_decode_steps)\n",
         "\n",
         "    input_token_ids = [0] * self._max_seq_len\n",
-        "    input_token_ids[:len(token_ids)] = token_ids\n",
-        "    model_inputs = self._init_kv_cache()\n",
-        "    model_inputs.update({\n",
-        "        \"tokens\": np.asarray([input_token_ids], dtype=np.int32),\n",
-        "        \"input_pos\": np.arange(self._max_seq_len, dtype=np.int32),\n",
         "    })\n",
         "    decode_text = []\n",
-        "    decode_step = 0\n",
-        "    print('Running prefill')\n",
-        "    for step in range(actual_max_decode_steps+1):\n",
-        "      signature_runner = self._prefill_runner if step == 0 else self._decode_runner\n",
-        "      model_outputs = signature_runner(**model_inputs)\n",
-        "      # At prefill stage, output logits has shape (batch=1, seq_size, vocab_size)\n",
-        "      # At decode stage, output logits has shape (batch=1, 1, vocab_size).\n",
-        "      selected_logit = len(token_ids)-1 if step == 0 else 0\n",
-        "      logits = model_outputs.pop(\"logits\")[0][selected_logit]\n",
-        "\n",
-        "      if step == 0:\n",
-        "        print('Running decode')\n",
-        "\n",
-        "      # Decode text output.\n",
         "      next_token = self._greedy_sampler(logits)\n",
         "      if next_token == self._tokenizer.eos_token_id:\n",
         "        break\n",
         "      decode_text.append(self._tokenizer.decode(next_token, skip_special_tokens=False))\n",
         "      print(decode_text[-1], end='', flush=True)\n",
-        "      # The rest of the outputs is the updated kv cache.\n",
-        "      model_inputs = model_outputs\n",
-        "      model_inputs.update({\n",
-        "          \"tokens\": np.array([[next_token]], dtype=np.int32),\n",
-        "          \"input_pos\": np.array([decode_step + len(token_ids)], dtype=np.int32),})\n",
-        "      decode_step += 1\n",
         "\n",
         "\n",
         "\n",
-        "    print() # print a new line at the end.\n",
-        "    return ''.join(decode_text)\n"
       ],
       "metadata": {
         "id": "UBSGrHrM4ANm"
       },
-      "execution_count": null,
       "outputs": []
     },
     {
@@ -279,7 +363,7 @@
       "metadata": {
         "id": "AZhlDQWg61AL"
       },
-      "execution_count": null,
       "outputs": []
     },
     {
@@ -293,15 +377,6 @@
       },
       "execution_count": null,
       "outputs": []
-    },
-    {
-      "cell_type": "code",
-      "source": [],
-      "metadata": {
-        "id": "GNzDBxDFEuAJ"
-      },
-      "execution_count": null,
-      "outputs": []
     }
   ]
 }

     {
       "cell_type": "code",
       "source": [
+        "!pip install ai-edge-litert"
       ],
       "metadata": {
+        "id": "43tAeO0AZ7zp",
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "outputId": "7ce4d1ef-7d6b-4855-b73b-22482e3c693d"
       },
+      "execution_count": 1,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "name": "stdout",
+          "text": [
+            "Requirement already satisfied: ai-edge-litert in /usr/local/lib/python3.11/dist-packages (1.1.2)\n",
+            "Requirement already satisfied: flatbuffers in /usr/local/lib/python3.11/dist-packages (from ai-edge-litert) (25.2.10)\n",
+            "Requirement already satisfied: numpy>=1.23.2 in /usr/local/lib/python3.11/dist-packages (from ai-edge-litert) (1.26.4)\n"
+          ]
+        }
+      ]
     },
     {
       "cell_type": "code",
       "metadata": {
         "id": "i6PMkMVBPr1p"
       },
+      "execution_count": 2,
       "outputs": []
     },
     {
       "metadata": {
         "id": "3t47HAG2tvc3"
       },
+      "execution_count": 3,
       "outputs": []
     },
     {
       "metadata": {
         "id": "Rvdn3EIZhaQn"
       },
+      "execution_count": 4,
       "outputs": []
     },
     {
     {
       "cell_type": "code",
       "source": [
+        "\n",
         "\n",
         "class LiteRTLlmPipeline:\n",
         "\n",
         "    Args:\n",
         "      num_input_tokens: The number of input tokens.\n",
         "    \"\"\"\n",
+        "    if not self._interpreter:\n",
+        "      raise ValueError(\"Interpreter is not initialized.\")\n",
         "\n",
+        "    # Prefill runner related variables will be initialized in `predict_text` and\n",
+        "    # `compute_log_likelihood`.\n",
         "    self._prefill_runner = self._get_prefill_runner(num_input_tokens)\n",
         "    # input_token_shape has shape (batch, max_seq_len)\n",
         "    input_token_shape = self._prefill_runner.get_input_details()[\"tokens\"][\n",
         "      )\n",
         "    return self._interpreter.get_signature_runner(best_signature)\n",
         "\n",
+        "  def _run_prefill(\n",
+        "      self, prefill_token_ids: Sequence[int],\n",
+        "  ) -> dict[str, np.ndarray]:\n",
+        "    \"\"\"Runs prefill and returns the kv cache.\n",
         "\n",
+        "    Args:\n",
+        "      prefill_token_ids: The token ids of the prefill input.\n",
         "\n",
+        "    Returns:\n",
+        "      The updated kv cache.\n",
+        "    \"\"\"\n",
+        "    if not self._prefill_runner:\n",
+        "      raise ValueError(\"Prefill runner is not initialized.\")\n",
+        "    prefill_token_length = len(prefill_token_ids)\n",
+        "    if prefill_token_length == 0:\n",
+        "      return self._init_kv_cache()\n",
         "\n",
+        "    # Prepare the input to be [1, max_seq_len].\n",
         "    input_token_ids = [0] * self._max_seq_len\n",
+        "    input_token_ids[:prefill_token_length] = prefill_token_ids\n",
+        "    input_token_ids = np.asarray(input_token_ids, dtype=np.int32)\n",
+        "    input_token_ids = np.expand_dims(input_token_ids, axis=0)\n",
+        "\n",
+        "    # Prepare the input position to be [max_seq_len].\n",
+        "    input_pos = [0] * self._max_seq_len\n",
+        "    input_pos[:prefill_token_length] = range(prefill_token_length)\n",
+        "    input_pos = np.asarray(input_pos, dtype=np.int32)\n",
+        "\n",
+        "    # Initialize kv cache.\n",
+        "    prefill_inputs = self._init_kv_cache()\n",
+        "    prefill_inputs.update({\n",
+        "        \"tokens\": input_token_ids,\n",
+        "        \"input_pos\": input_pos,\n",
         "    })\n",
+        "    prefill_outputs = self._prefill_runner(**prefill_inputs)\n",
+        "    if \"logits\" in prefill_outputs:\n",
+        "      # Prefill outputs include logits and kv cache. We only return the kv cache.\n",
+        "      prefill_outputs.pop(\"logits\")\n",
+        "\n",
+        "    return prefill_outputs\n",
+        "\n",
+        "  def _greedy_sampler(self, logits: np.ndarray) -> int:\n",
+        "    return int(np.argmax(logits))\n",
+        "\n",
+        "\n",
+        "  def _run_decode(\n",
+        "      self,\n",
+        "      start_pos: int,\n",
+        "      start_token_id: int,\n",
+        "      kv_cache: dict[str, np.ndarray],\n",
+        "      max_decode_steps: int,\n",
+        "  ) -> str:\n",
+        "    \"\"\"Runs decode and returns the text decoded from the greedy sampler.\n",
+        "\n",
+        "    Args:\n",
+        "      start_pos: The position of the first token of the decode input.\n",
+        "      start_token_id: The token id of the first token of the decode input.\n",
+        "      kv_cache: The kv cache from the prefill.\n",
+        "      max_decode_steps: The max decode steps.\n",
+        "\n",
+        "    Returns:\n",
+        "      The decoded text of the tokens picked by the greedy sampler.\n",
+        "    \"\"\"\n",
+        "    next_pos = start_pos\n",
+        "    next_token = start_token_id\n",
         "    decode_text = []\n",
+        "    decode_inputs = kv_cache\n",
+        "\n",
+        "    for _ in range(max_decode_steps):\n",
+        "      decode_inputs.update({\n",
+        "          \"tokens\": np.array([[next_token]], dtype=np.int32),\n",
+        "          \"input_pos\": np.array([next_pos], dtype=np.int32),\n",
+        "      })\n",
+        "      decode_outputs = self._decode_runner(**decode_inputs)\n",
+        "      # Output logits has shape (batch=1, 1, vocab_size). We only take the first\n",
+        "      # element.\n",
+        "      logits = decode_outputs.pop(\"logits\")[0][0]\n",
         "      next_token = self._greedy_sampler(logits)\n",
         "      if next_token == self._tokenizer.eos_token_id:\n",
         "        break\n",
         "      decode_text.append(self._tokenizer.decode(next_token, skip_special_tokens=False))\n",
         "      print(decode_text[-1], end='', flush=True)\n",
+        "      # Decode outputs include logits and kv cache. We already popped out\n",
+        "      # logits, so the rest is kv cache. We pass the updated kv cache as input\n",
+        "      # to the next decode step.\n",
+        "      decode_inputs = decode_outputs\n",
+        "      next_pos += 1\n",
+        "\n",
+        "    print() # print a new line at the end.\n",
+        "    return ''.join(decode_text)\n",
         "\n",
+        "  def generate(self, prompt: str, max_decode_steps: int | None = None) -> str:\n",
+        "    messages=[{ 'role': 'user', 'content': prompt}]\n",
+        "    token_ids = self._tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)\n",
+        "    # Initialize the prefill runner with the suitable input size.\n",
+        "    self._init_prefill_runner(len(token_ids))\n",
         "\n",
+        "    # Run prefill.\n",
+        "    # Prefill up to the second-to-last token of the prompt, because the last\n",
+        "    # token of the prompt will be used to bootstrap decode.\n",
+        "    prefill_token_length = len(token_ids) - 1\n",
         "\n",
+        "    print('Running prefill')\n",
+        "    kv_cache = self._run_prefill(token_ids[:prefill_token_length])\n",
+        "    # Run decode.\n",
+        "    print('Running decode')\n",
+        "    actual_max_decode_steps = self._max_kv_cache_seq_len - prefill_token_length - 1\n",
+        "    if max_decode_steps is not None:\n",
+        "      actual_max_decode_steps = min(actual_max_decode_steps, max_decode_steps)\n",
+        "    decode_text = self._run_decode(\n",
+        "        prefill_token_length,\n",
+        "        token_ids[prefill_token_length],\n",
+        "        kv_cache,\n",
+        "        actual_max_decode_steps,\n",
+        "    )\n",
+        "    return decode_text"
       ],
       "metadata": {
         "id": "UBSGrHrM4ANm"
       },
+      "execution_count": 7,
       "outputs": []
     },
     {
       "metadata": {
         "id": "AZhlDQWg61AL"
       },
+      "execution_count": 8,
       "outputs": []
     },
     {
       },
       "execution_count": null,
       "outputs": []
     }
   ]
 }