Digital_Detectives_RAG_System

Sleeping

App Files Files Community

GeorgiosIoannouCoder commited on Nov 5, 2024

Commit

77cd2ef

verified ·

1 Parent(s): c924850

Create Fall_2024_Ioannou_Georgios_RAG_tutorial_11_05_2024.ipynb

Browse files

Files changed (1) hide show

Fall_2024_Ioannou_Georgios_RAG_tutorial_11_05_2024.ipynb +677 -0

Fall_2024_Ioannou_Georgios_RAG_tutorial_11_05_2024.ipynb ADDED Viewed

	@@ -0,0 +1,677 @@

+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "NgfYnPJIcitW"
+      },
+      "source": [
+        "---\n",
+        "\n",
+        "# Ioannou_Georgios\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "BAdncZ1Ccmn_"
+      },
+      "source": [
+        "## Copyright © 2024 by Georgios Ioannou\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "vxYZpoi3dgfL"
+      },
+      "source": [
+        "---\n",
+        "\n",
+        "<h1 align=\"center\"> RAG Question Answering Application Using TXT Files, MongoDB As The Vector Database, HuggingFace Embedding Model, HuggingFace LLM, and Gradio </h1>\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "<h2 align=\"center\"> HuggingFace Embedding Model Used: <a href=\"https://huggingface.co/sentence-transformers/all-mpnet-base-v2\"> all-mpnet-base-v2 </a> </h2>\n"
+      ],
+      "metadata": {
+        "id": "MWzJLfMsCrqt"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "<h2 align=\"center\"> HuggingFace LLM Model Used: <a href=\"https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct\"> Qwen2.5-1.5B-Instruct </a> </h2>\n"
+      ],
+      "metadata": {
+        "id": "0l8WK80uC9WZ"
+      }
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "xXdDSfrtzW10"
+      },
+      "source": [
+        "---\n",
+        "\n",
+        "<h2 align=\"center\"> Install Libraries </h2>\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "wEoN-qN9cxjt"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install gradio pymongo langchain-community transformers"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "oXSlapLqeXoJ"
+      },
+      "outputs": [],
+      "source": [
+        "# Import libraries.\n",
+        "# Gradio.\n",
+        "import gradio as gr\n",
+        "\n",
+        "# File loading and environment variables.\n",
+        "import os\n",
+        "import sys\n",
+        "\n",
+        "# File loading and environment variables.\n",
+        "from getpass import getpass\n",
+        "from google.colab import userdata\n",
+        "from google.colab import drive\n",
+        "\n",
+        "# Gradio.\n",
+        "from gradio.themes.base import Base\n",
+        "\n",
+        "# HuggingFace LLM.\n",
+        "from huggingface_hub import InferenceClient\n",
+        "\n",
+        "# Langchain.\n",
+        "from langchain.document_loaders import TextLoader\n",
+        "from langchain.prompts import PromptTemplate\n",
+        "from langchain.schema.runnable import RunnablePassthrough, RunnableLambda\n",
+        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+        "from langchain_community.vectorstores import MongoDBAtlasVectorSearch\n",
+        "from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
+        "\n",
+        "# MongoDB.\n",
+        "from pymongo import MongoClient\n",
+        "\n",
+        "# Function type hints.\n",
+        "from typing import Dict, Any"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "qNMAqdpWf5Iq"
+      },
+      "source": [
+        "## Step 1: Data Sourcing and Preparation\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "PRKmpcMWjXeg"
+      },
+      "outputs": [],
+      "source": [
+        "# For Google Colab.\n",
+        "# Mount (connect) your Google Drive to your Colab environment.\n",
+        "# This will establish a connection to your Google Drive, making it accessible from your Colab notebook.\n",
+        "\n",
+        "drive.mount(\"/content/drive/\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "V_YnoLTkjXek"
+      },
+      "outputs": [],
+      "source": [
+        "# For Google Colab.\n",
+        "! ls \"/content/drive/My Drive/zoom-transcripts/\""
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "qGXN8pAWjXen"
+      },
+      "outputs": [],
+      "source": [
+        "# For Google Colab.\n",
+        "# Append your directory path to the Python system path.\n",
+        "directory_path = \"/content/drive/My Drive/zoom-transcripts/\"\n",
+        "\n",
+        "sys.path.append(directory_path)\n",
+        "\n",
+        "# Print the updated system path to the console.\n",
+        "print(\"sys.path =\", sys.path)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "xwnzuw0NjXeq"
+      },
+      "outputs": [],
+      "source": [
+        "# Get all the filenames under our directory path.\n",
+        "my_txts = os.listdir(directory_path)\n",
+        "my_txts"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ggS61lmnjXer"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the TXT.\n",
+        "\n",
+        "loaders = []\n",
+        "for my_txt in my_txts:\n",
+        "    my_txt_path = os.path.join(directory_path, my_txt)\n",
+        "    loaders.append(TextLoader(my_txt_path))\n",
+        "\n",
+        "print(\"len(loaders) =\", len(loaders))\n",
+        "\n",
+        "loaders"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "H9g8SGTGjXes"
+      },
+      "outputs": [],
+      "source": [
+        "# Load the TXT.\n",
+        "\n",
+        "data = []\n",
+        "for loader in loaders:\n",
+        "    data.append(loader.load())\n",
+        "\n",
+        "print(\"len(data) =\", len(data), \"\\n\")\n",
+        "\n",
+        "# First TXT file.\n",
+        "data[0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "SSZOD3M8jXey"
+      },
+      "outputs": [],
+      "source": [
+        "# Initialize the text splitter\n",
+        "# Uses a text splitter to split the data into smaller documents.\n",
+        "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "fAqdCPx8jXez"
+      },
+      "outputs": [],
+      "source": [
+        "# Split the TXT documents into chunks.\n",
+        "docs = []\n",
+        "for doc in data:\n",
+        "    chunk = text_splitter.split_documents(doc)\n",
+        "    docs.append(chunk)\n",
+        "\n",
+        "# # Debugging purposes to print the number of documents in each chunk.\n",
+        "# # Print the number of total documents to be stored in the vector database.\n",
+        "# total = 0\n",
+        "# for i in range(len(docs)):\n",
+        "#     if i == len(docs) - 1:\n",
+        "#         print(len(docs[i]), end=\"\")\n",
+        "#     else:\n",
+        "#         print(len(docs[i]), \"+ \", end=\"\")\n",
+        "#     total += len(docs[i])\n",
+        "# print(\" =\", total, \" total documents\\n\")\n",
+        "\n",
+        "# # Print the first document.\n",
+        "# print(docs[0], \"\\n\\n\\n\")\n",
+        "\n",
+        "# # Print the total number of TXT files.\n",
+        "# # docs is a list of lists where each list stores all the documents for one TXT file.\n",
+        "# print(len(docs), \"chunks in docs list\")\n",
+        "\n",
+        "# docs"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "CRdD2CQXjXe0"
+      },
+      "outputs": [],
+      "source": [
+        "# Merge the documents into a single list to be embedded so that they can be stored in the vector database.\n",
+        "merged_documents = []\n",
+        "\n",
+        "for doc in docs:\n",
+        "    merged_documents.extend(doc)\n",
+        "\n",
+        "# Print the merged list of all the documents.\n",
+        "print(\"len(merged_documents) =\", len(merged_documents))\n",
+        "print(merged_documents)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Step 2: Vector Database Setup\n"
+      ],
+      "metadata": {
+        "id": "amLFTvEUrYHR"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Connect to MongoDB Atlas cluster using the connection string.\n",
+        "MONGO_URI = getpass(\"MONGO_URI:\")\n",
+        "cluster = MongoClient(MONGO_URI)\n",
+        "\n",
+        "# Define the MongoDB database and collection name.\n",
+        "DB_NAME = \"txts\"\n",
+        "COLLECTION_NAME = \"txts_collection\"\n",
+        "\n",
+        "# Connect to the specific collection in the database.\n",
+        "MONGODB_COLLECTION = cluster[DB_NAME][COLLECTION_NAME]\n",
+        "\n",
+        "vector_search_index = \"vector_index\""
+      ],
+      "metadata": {
+        "id": "vcYEWk7Dnoz_"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Delete any existing records in the collection.\n",
+        "# Clear the collection.\n",
+        "MONGODB_COLLECTION.delete_many({})"
+      ],
+      "metadata": {
+        "id": "wEabWPjmnrWc"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Step 3: Generate Embeddings and Data Ingestion Into MongoDB"
+      ],
+      "metadata": {
+        "id": "poyFalAIrd3g"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "HF_TOKEN = getpass(\"HF_TOKEN:\")\n",
+        "# https://python.langchain.com/docs/integrations/text_embedding/huggingfacehub/#hugging-face-inference-api\n",
+        "embedding_model = HuggingFaceInferenceAPIEmbeddings(\n",
+        "    api_key=HF_TOKEN, model_name=\"sentence-transformers/all-mpnet-base-v2\"\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "qBMpZjK_rrSi"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Initialize the MongoDB Atlas vector search with the document segments.\n",
+        "# Create a vector store (vector database) from the documents.\n",
+        "vector_search = MongoDBAtlasVectorSearch.from_documents(\n",
+        "    documents=merged_documents, # The sample documents to store in the vector database.\n",
+        "    embedding=embedding_model, # HuggingFace's embedding model as the model used to convert text into vector embeddings for the embedding field.\n",
+        "    collection=MONGODB_COLLECTION, # txts.txts_collection as the Atlas collection to store the documents.\n",
+        "    index_name=vector_search_index # vector_index as the index to use for querying the vector store.\n",
+        ")\n",
+        "\n",
+        "# At this point, the chunks in 'merged_documents' are embedded and stored in MongoDB Atlas, enabling vector search."
+      ],
+      "metadata": {
+        "id": "fVEp3QfHnc3l"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Connect to an existing vector store (database).\n",
+        "# ONLY RUN IT IF YOU HAVE AN EXISTING VECTOR STORE AND YOU JUST NEED TO CONNECT TO IT.\n",
+        "vector_search = MongoDBAtlasVectorSearch.from_connection_string(\n",
+        "    connection_string=MONGO_URI,\n",
+        "    namespace=f\"{DB_NAME}.{COLLECTION_NAME}\",\n",
+        "    embedding=embedding_model,\n",
+        "    index_name=vector_search_index,\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "kj8wfTv38zAG"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "DGlA_MrqpSkQ"
+      },
+      "source": [
+        "## Step 4: Vector Search\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Semantic Search.\n",
+        "query = \"Who is Georgios?\"\n",
+        "results = vector_search.similarity_search(query=query, k=10) # 10 most similar documents.\n",
+        "\n",
+        "print(\"\\n\")\n",
+        "print(results)\n",
+        "# # Better looking output.\n",
+        "# from pprint import pprint\n",
+        "# pprint(results)"
+      ],
+      "metadata": {
+        "id": "K4-w_1Q7r85B"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Filter on metadata.\n",
+        "# Semantic search with filtering.\n",
+        "query = \"Who is Georgios?\"\n",
+        "\n",
+        "results = vector_search.similarity_search_with_score(\n",
+        "   query = query,\n",
+        "   k = 10, # 10 most similar documents.\n",
+        "   pre_filter = { \"source\": { \"$eq\": \"/content/drive/My Drive/zoom-transcripts/Week-01-Setup-Pandas-Tuesday-2024-08-27.vtt\" } } # Filtering on the source.\n",
+        ")\n",
+        "\n",
+        "print(results)\n",
+        "# # Better looking output.\n",
+        "# from pprint import pprint\n",
+        "# pprint(results)"
+      ],
+      "metadata": {
+        "id": "rxEssV0uuKNk"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Basic RAG.\n",
+        "# k to search for only the 10 most relevant documents.\n",
+        "# score_threshold to use only documents with a relevance score above 0.85.\n",
+        "retriever_1 = vector_search.as_retriever(\n",
+        "   search_type = \"similarity\", # similarity, mmr, similarity_score_threshold. https://api.python.langchain.com/en/latest/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.as_retriever\n",
+        "   search_kwargs = {\"k\": 10, \"score_threshold\": 0.85}\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "f0GIVNFpuQnP"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# RAG with Filtering.\n",
+        "# k to search for only the 10 most relevant documents.\n",
+        "# score_threshold to use only documents with a relevance score above 0.89.\n",
+        "# pre_filter to filter documents where the source is equal to \"/content/drive/My Drive/zoom-transcripts/Week-01-Setup-Pandas-Tuesday-2024-08-27.vtt\".\n",
+        "retriever_2 = vector_search.as_retriever(\n",
+        "   search_type = \"similarity\", # similarity, mmr, similarity_score_threshold. https://api.python.langchain.com/en/latest/vectorstores/langchain_core.vectorstores.VectorStore.html#langchain_core.vectorstores.VectorStore.as_retriever\n",
+        "   search_kwargs = {\n",
+        "      \"k\": 10,\n",
+        "      \"score_threshold\": 0.89,\n",
+        "      \"pre_filter\": { \"source\": { \"$eq\": \"/content/drive/My Drive/zoom-transcripts/Week-01-Setup-Pandas-Tuesday-2024-08-27.vtt\" } }\n",
+        "   }\n",
+        ")"
+      ],
+      "metadata": {
+        "id": "E0GMWmBqxK6D"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "NBS7TGJoE-tb"
+      },
+      "source": [
+        "## Step 5: LLM\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Format the retrieved documents before inserting them into the system prompt template.\n",
+        "def format_docs(docs):\n",
+        "    return \"\\n\\n\".join(doc.page_content for doc in docs)\n",
+        "\n",
+        "def generate_response(input_dict: Dict[str, Any]) -> str:\n",
+        "    formatted_prompt = prompt.format(**input_dict)\n",
+        "    # print(formatted_prompt)\n",
+        "    response = hf_client.chat.completions.create(\n",
+        "        model=\"Qwen/Qwen2.5-1.5B-Instruct\",\n",
+        "        messages=[{\n",
+        "            \"role\": \"system\",\n",
+        "            \"content\": formatted_prompt\n",
+        "        },{\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": input_dict[\"question\"]\n",
+        "        }],\n",
+        "        max_tokens=1000,\n",
+        "        temperature=0.2,\n",
+        "    )\n",
+        "\n",
+        "    return response.choices[0].message.content\n",
+        "\n",
+        "# Initialize Hugging Face client\n",
+        "hf_client = InferenceClient(api_key=HF_TOKEN)\n",
+        "\n",
+        "# Define the prompt template\n",
+        "prompt = PromptTemplate.from_template(\n",
+        "    \"\"\"Use the following pieces of context to answer the question at the end.\n",
+        "\n",
+        "    START OF CONTEXT:\n",
+        "    {context}\n",
+        "    END OF CONTEXT:\n",
+        "\n",
+        "    START OF QUESTION:\n",
+        "    {question}\n",
+        "    END OF QUESTION:\n",
+        "\n",
+        "    If you do not know the answer, just say that you do not know.\n",
+        "    NEVER assume things.\n",
+        "    \"\"\"\n",
+        ")\n",
+        "\n",
+        "# Build the chain with retriever_1.\n",
+        "rag_chain = (\n",
+        "    {\"context\": retriever_1 | RunnableLambda(format_docs), \"question\": RunnablePassthrough()}\n",
+        "    | RunnableLambda(generate_response)\n",
+        ")\n",
+        "\n",
+        "# Example usage.\n",
+        "query = \"Who is Georgios?\"\n",
+        "answer = rag_chain.invoke(query)\n",
+        "\n",
+        "print(\"\\nQuestion:\", query)\n",
+        "print(\"Answer:\", answer)\n",
+        "\n",
+        "# Get source documents related to the query.\n",
+        "documents = retriever_1.invoke(query)\n",
+        "print(\"\\nSource documents:\")\n",
+        "# Better looking output.\n",
+        "from pprint import pprint\n",
+        "# Print the documents retrieved just above for this query, not the\n",
+        "# 'results' variable left over from the earlier similarity-search cells.\n",
+        "pprint(documents)"
+      ],
+      "metadata": {
+        "id": "fTTCikzK4-Ct"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# # For debugging purposes to look into the chain more in-depth.\n",
+        "# from langchain_core.tracers.stdout import ConsoleCallbackHandler\n",
+        "# answer = rag_chain.invoke(query, config={'callbacks': [ConsoleCallbackHandler()]})"
+      ],
+      "metadata": {
+        "id": "eGvev04J7yUJ"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Does the LLM already have the knowledge or not?\n",
+        "client = InferenceClient(api_key=HF_TOKEN )\n",
+        "\n",
+        "messages = [\n",
+        "\t{\n",
+        "\t\t\"role\": \"user\",\n",
+        "\t\t\"content\": \"Who is Harpreet?\"\n",
+        "\t}\n",
+        "]\n",
+        "\n",
+        "stream = client.chat.completions.create(\n",
+        "  model=\"Qwen/Qwen2.5-1.5B-Instruct\",\n",
+        "\tmessages=messages,\n",
+        "\tmax_tokens=500,\n",
+        "\tstream=True\n",
+        ")\n",
+        "\n",
+        "for chunk in stream:\n",
+        "    # delta.content is optional and may be None on some chunks;\n",
+        "    # fall back to an empty string so the literal text \"None\" is never printed.\n",
+        "    print(chunk.choices[0].delta.content or \"\", end=\"\")"
+      ],
+      "metadata": {
+        "id": "xHkgODNYOyjW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Step 6: Gradio\n"
+      ],
+      "metadata": {
+        "id": "7NFmu95wH1rP"
+      }
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Input : query.\n",
+        "# Output: answer.\n",
+        "\n",
+        "def get_response(query):\n",
+        "  return rag_chain.invoke(query)"
+      ],
+      "metadata": {
+        "id": "e2d4id4tH3MW"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Gradio application.\n",
+        "with gr.Blocks(theme=Base(), title=\"RAG Question Answering App Using .txt Files, MongoDB Vector Database, HuggingFace, and Gradio\") as demo:\n",
+        "    gr.Markdown(\n",
+        "        \"\"\"\n",
+        "        # RAG Question Answering App Using .txt Files, MongoDB Vector Database, HuggingFace, and Gradio\n",
+        "        \"\"\")\n",
+        "    textbox = gr.Textbox(label=\"Question:\")\n",
+        "    with gr.Row():\n",
+        "        button = gr.Button(\"Submit\", variant=\"primary\")\n",
+        "    with gr.Column():\n",
+        "        output1 = gr.Textbox(lines=1, max_lines=10, label=\"Answer:\")\n",
+        "\n",
+        "\n",
+        "# Call get_response function upon clicking the Submit button.\n",
+        "    button.click(get_response, textbox, outputs=[output1])\n",
+        "\n",
+        "demo.launch(share=True)"
+      ],
+      "metadata": {
+        "id": "MMbeOhixICrw"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}