# %% Cell 3bf1d8de - optional NumPy version check (kept disabled)
#import numpy as np
#print(np.__version__)

# %% Cell b47ee338 - load the telecom plan catalogue from CSV
from datasets import load_dataset

data_files = "E:/Hugging_Face/telecom_plans.csv"
plan_data = load_dataset("csv", data_files=data_files, split="train")
plan_data[13]

# %% Cell 6cf11fc3 - add a natural-language "Description" column to every plan
def description(example):
    """Build a one-paragraph English description of a single plan row.

    Parameters
    ----------
    example : mapping
        One dataset row. The keys read here are 'Provider', 'Plan Name',
        'Price', 'Data', 'BYOD', 'Hotspot' and 'Notes'.

    Returns
    -------
    dict
        ``{"Description": <str>}``; ``Dataset.map`` merges it into the row as
        a new column.

    Notes
    -----
    'Notes' is matched case-insensitively and ignoring surrounding whitespace
    against 'New', 'Add-a-line' and 'HUG'. When the value is missing or is
    none of those, the eligibility sentence is left out rather than claiming
    the plan is restricted to hardware-upgrade customers.
    """
    raw_note = example['Notes']
    # Non-string values (e.g. None from an empty CSV cell) have no usable key.
    note_key = raw_note.strip().casefold() if isinstance(raw_note, str) else None

    if note_key == 'new':
        customer_note = 'new customers migrating from other service providers'
    elif note_key == 'add-a-line':
        customer_note = f"adding a line on existing {example['Provider']} account"
    elif note_key == 'hug':
        customer_note = 'hardware upgrade customers only'
    else:
        # Unknown eligibility: say nothing instead of asserting a restriction.
        customer_note = None

    text = (
        f"{example['Provider']} offers a plan "
        f"{example['Plan Name']} for the price of ${example['Price']} a month, which includes {example['Data']} "
        f"data with {'BYOD' if example['BYOD'] else 'device financing over 24 months'} and "
        f"{'allows you to data-share with Hotspot' if example['Hotspot'] else 'does not include data-share with Hotspot'}."
    )
    if customer_note is not None:
        text += f" This plan is for {customer_note}."

    return {"Description": text}


plan_data_mod = plan_data.map(description)
# %% Cell 167e31ce - preview the first five rows, including the new column
plan_data_mod[:5]

# %% Cell 9664ba16 - environment repair for urllib3 (kept disabled)
#!pip uninstall urllib3 -y
#!pip install urllib3==1.26.18

# %% Cell 82bcbeea - dependency upgrade (kept disabled)
#!pip install --upgrade accelerate transformers

# %% Cell a7a41116 - embed every plan description with a sentence encoder
from sentence_transformers import SentenceTransformer

# Checkpoint name for the sentence encoder; the same `embedder` object is
# reused later to encode user queries.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

# One description string per plan, in dataset row order.
descriptions = plan_data_mod["Description"]
# One embedding row per description, returned as a NumPy array.
embeddings = embedder.encode(
    descriptions,
    convert_to_numpy=True,
)

# %% Cell e5742a93 - inspect the embedding matrix
embeddings
# %% Cell 3aaf4323 - build an exact inner-product FAISS index
import faiss, numpy as np

# np.array copies, so the in-place L2 normalisation below cannot modify the
# `embeddings` array produced (and displayed) by the earlier cells.
X = np.array(embeddings, dtype="float32", order="C")
faiss.normalize_L2(X)

# Inner product of unit-length vectors equals cosine similarity.
index = faiss.IndexFlatIP(X.shape[1])
index.add(X)

# %% Cell b39fda8e - show the index object
index

# %% Cell f933cfd5 - nearest-neighbour lookup
def retrieve(query, k=5):
    """Return the plans whose descriptions are most similar to ``query``.

    Parameters
    ----------
    query : str
        Free-text user question; encoded with the module-level ``embedder``.
    k : int, default 5
        Maximum number of hits. Values larger than the number of indexed
        vectors are clamped; ``k <= 0`` yields an empty list.

    Returns
    -------
    list[tuple[int, float]]
        ``(row_index, score)`` pairs in the order returned by FAISS. Scores
        are inner products of L2-normalised vectors, i.e. cosine similarities
        in [-1, 1].
    """
    # Clamp k so the search never asks for more neighbours than exist.
    k = min(int(k), index.ntotal)
    if k <= 0:
        return []

    q = embedder.encode([query]).astype("float32")
    faiss.normalize_L2(q)
    scores, idx = index.search(q, k)
    # FAISS marks missing neighbours with id -1; a negative id would silently
    # select the *last* dataset row downstream, so drop those entries.
    return [(int(i), float(s)) for i, s in zip(idx[0], scores[0]) if i >= 0]


# %% Cell 34a49b4c - smoke-test the retriever
ex_1 = "what are the best BYOD plans in Virgin plus below 60$"

pair_1 = retrieve(ex_1)
print(pair_1)

# %% Cell 1028f1cd - Hugging Face Hub login (kept disabled)
#from huggingface_hub import notebook_login

#notebook_login()

# %% Cell 7aad76c2 - optional download accelerator (kept disabled)
#pip install hf_xet
# %% Cell d9164db6 - load the instruction-tuned seq2seq model used for answers
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("MBZUAI/LaMini-Flan-T5-783M")
model = AutoModelForSeq2SeqLM.from_pretrained("MBZUAI/LaMini-Flan-T5-783M")
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)
# Display the constructed pipeline object, not the `pipeline` factory function.
qa_pipeline

# %% Cell 7c939d25 - retrieval-augmented answering
def answer_question(query):
    """Answer a plan question using the five most similar plan descriptions.

    Parameters
    ----------
    query : str
        The user's question.

    Returns
    -------
    str
        The generated answer with surrounding whitespace stripped and
        duplicate lines removed (first occurrence kept, order preserved).
    """
    top_k = retrieve(query, k=5)
    # Skip negative ids: they are "no neighbour" placeholders, and indexing
    # the dataset with them would return an unrelated row from the end.
    context = "\n".join(
        plan_data_mod[i]["Description"] for i, _ in top_k if i >= 0
    )
    prompt = (
        f"You are a telecom assistant helping a user choose the best phone plan. "
        f"Only include plans that match the question clearly. Be concise and friendly.\n\n"
        f"Plans:\n{context}\n\n"
        f"Question: {query}\n"
        f"Answer in full sentences:"
    )

    # do_sample=False requests deterministic (non-sampled) decoding.
    response = qa_pipeline(prompt, max_new_tokens=100, do_sample=False)
    raw_answer = response[0]['generated_text']

    lines = raw_answer.strip().split('\n')
    # dict.fromkeys drops repeated lines while keeping first-seen order.
    unique_lines = list(dict.fromkeys(lines))
    cleaned_answer = "\n".join(unique_lines)

    return cleaned_answer


# %% Cell 8e0b5b11 - text-overlap and embedding-based evaluation metrics
import evaluate
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")
bertscore = evaluate.load("bertscore")

def eval_metrics(prediction: str, reference: str) -> dict:
    """Score one generated answer against one reference answer.

    Parameters
    ----------
    prediction : str
        Model output.
    reference : str
        Ground-truth text to compare against.

    Returns
    -------
    dict
        Keys 'BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L' and 'BERTScore-F1'.
        If either text is empty or whitespace-only, every score is 0.0.
    """
    # NOTE(review): BLEU divides by the token count and is believed to raise
    # ZeroDivisionError on empty text - confirm against the installed
    # `evaluate` version. Short-circuit so one blank answer cannot abort a run.
    if not prediction.strip() or not reference.strip():
        return {
            "BLEU": 0.0,
            "ROUGE-1": 0.0,
            "ROUGE-2": 0.0,
            "ROUGE-L": 0.0,
            "BERTScore-F1": 0.0,
        }

    bleu_res = bleu.compute(predictions=[prediction], references=[reference])
    rouge_res = rouge.compute(predictions=[prediction], references=[reference])
    bert_res = bertscore.compute(predictions=[prediction], references=[reference], lang="en")
    return {
        "BLEU": bleu_res["bleu"],
        "ROUGE-1": rouge_res["rouge1"],
        "ROUGE-2": rouge_res["rouge2"],
        "ROUGE-L": rouge_res["rougeL"],
        # bertscore returns one F1 per pair; average them into a single float.
        "BERTScore-F1": float(sum(bert_res["f1"]) / len(bert_res["f1"]))
    }
These plans are for hardware upgrade customers only and include unlimited data with device financing over 24 months and data-sharing with Hotspot.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "84315c6640e848598284ac5f4ea5f0fb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "tokenizer_config.json: 0%| | 0.00/25.0 [00:00