Commit: 46dfa3e
Parent(s): 2550c52
rag: use model Mistral-Nemo-Base-2407 instead of zephyr; various minor fixes; added docstrings to the class methods
Files changed: src/rag.py (+97 -24)

src/rag.py CHANGED
@@ -1,18 +1,14 @@
 
 import dotenv
-import os
-from langchain_community.document_loaders import UnstructuredURLLoader, PyPDFLoader
-from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-from …
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.document_loaders import UnstructuredURLLoader, PyPDFLoader
 from langchain_community.vectorstores import Chroma
 from langchain_community.vectorstores import FAISS
 from langchain_openai import ChatOpenAI
-
-from langchain_huggingface import HuggingFaceEndpoint
+from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings
 from langchain.chains import RetrievalQA
 from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
-from tqdm import tqdm
 
 class RAG():
     def __init__(
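Note: the import hunk also brings in HuggingFaceEmbeddings, which suggests the embeddings now come from a Hugging Face model as well. The body of create_embeddings is not visible in this diff; a minimal sketch of such an embedder, with the model name purely an assumption:

from langchain_huggingface import HuggingFaceEmbeddings

# Hypothetical sketch: the model actually used by create_embeddings() is not visible in this diff.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
query_vector = embeddings.embed_query("What is Machine Learning?")  # returns a list of floats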
@@ -27,7 +23,8 @@ class RAG():
 
         # Constants
         # self.use_model = 'gpt-4o-mini'
-        self.use_model = 'zephyr-7b-alpha'
+        # self.use_model = 'zephyr-7b-alpha'
+        self.use_model = 'Mistral-Nemo-Base-2407'
 
         # self.use_vectordb = 'chroma'
         self.use_vectordb = 'faiss'
@@ -42,9 +39,17 @@ class RAG():
         self.QAbot = None
 
         # Setup the bots
-        self.…
+        self.setup_rag_bot()
 
     def load_data(self, urls, pdfs):
+        """
+        Loads data from the input URLs and PDFs.
+        Args:
+            urls: List of URLs to load.
+            pdfs: List of PDF files to load.
+        Returns:
+            A list of Document objects loaded from the input URLs and PDFs.
+        """
         documents = []
         if urls:
             url_loader = UnstructuredURLLoader(urls=urls)
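Note: load_data simply concatenates the documents produced by the two loaders. A standalone sketch of the same pattern (the URL is taken from this file's __main__ block; the PDF path is hypothetical):

from langchain_community.document_loaders import UnstructuredURLLoader, PyPDFLoader

documents = []
url_loader = UnstructuredURLLoader(urls=["https://en.wikipedia.org/wiki/Machine_learning"])
documents.extend(url_loader.load())          # one Document per URL
pdf_loader = PyPDFLoader("docs/example.pdf")  # hypothetical path
documents.extend(pdf_loader.load())          # one Document per PDF page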
@@ -54,9 +59,15 @@ class RAG():
             documents.extend(pdf_loader.load())
         return documents
 
-    def sources_to_texts(self, …
-…
-…
+    def sources_to_texts(self, documents):
+        """
+        Takes a list of URLs and PDFs and converts them into a list of text chunks.
+        The text chunks are split into chunks of a certain size with a certain amount of overlap.
+        Args:
+            documents: a list of document objects loaded from the input data
+        Returns:
+            A list of text chunks.
+        """
 
         # Retrieval system
         chunk_size = 1000
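Note: the chunking uses RecursiveCharacterTextSplitter with chunk_size = 1000; the overlap value falls outside the visible hunk, so the 200 below is an assumption:

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,  # assumed; the real overlap is not visible in this diff
)
texts = splitter.split_documents(documents)  # list of chunked Document objects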
@@ -75,7 +86,14 @@ class RAG():
         return embeddings
 
     def create_retriever(self, texts, embeddings):
-        …
+        """
+        Creates a retriever from the given texts and embeddings.
+        Args:
+            texts: A list of text strings to encode in the vector store.
+            embeddings: An instance of langchain.Embeddings to use for encoding the texts.
+        Returns:
+            An instance of langchain.Retriever.
+        """
         if self.use_vectordb == 'chroma':
             print('Creating vector store with Chroma')
             vectorstore = Chroma.from_documents(texts, embeddings)
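Note: with self.use_vectordb = 'faiss', the FAISS branch follows the same pattern as the Chroma branch shown above. A sketch, with the retriever's k an assumed value since the search parameters fall outside the hunk:

from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(texts, embeddings)
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})  # k is an assumption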
@@ -87,23 +105,59 @@ class RAG():
         return retriever
 
     def create_llm(self):
-        …
+        """
+        Instantiates a language model based on the specified model type.
+
+        This function supports two models:
+        - 'gpt-4o-mini' through the ChatOpenAI interface
+        - 'Mistral-Nemo-Base-2407' through the HuggingFaceEndpoint, with provider: novita
+        ('zephyr-7b-alpha' through the HuggingFaceEndpoint is being tested, but not working at the moment)
+        The model is determined by the `self.use_model` attribute.
+        Returns an instance of the selected language model.
+
+        Returns:
+            llm: An instance of the chosen language model, either ChatOpenAI or HuggingFaceEndpoint.
+        """
+
         if self.use_model == 'gpt-4o-mini':
             print(f'As llm, using OpenAI model: {self.use_model}')
             llm = ChatOpenAI(
                 model_name="gpt-4o-mini",
                 temperature=0)
-        elif self.use_model == 'zephyr-7b-alpha':
-            …
+        # elif self.use_model == 'zephyr-7b-alpha':
+        #     print(f'As llm, using HF-Endpoint: {self.use_model}')
+        #     llm = HuggingFaceEndpoint(
+        #         repo_id=f"HuggingFaceH4/{self.use_model}",
+        #         temperature=0.1,
+        #         max_new_tokens=512,
+        #         do_sample=False
+        #     )
+        elif self.use_model == 'Mistral-Nemo-Base-2407':
+            provider = "novita"
+            print(f'As llm, using HF-Endpoint: {self.use_model} through provider: {provider}')
             llm = HuggingFaceEndpoint(
-                repo_id=…
+                repo_id="mistralai/Mistral-Nemo-Base-2407",
+                provider=provider,
                 temperature=0.1,
-                max_new_tokens=512
-            …
+                max_new_tokens=512,
+                do_sample=False
+            )
         return llm
 
     def create_QAbot(self, retriever, llm):
-        …
+        """
+        Creates a QAbot (Question-Answering bot) from the given retriever and language model.
+
+        The QAbot is a type of RetrievalQA chain built with Langchain that, for a given question:
+        - uses the given retriever to get the relevant documents
+        - and the given language model to generate an answer.
+        Args:
+            retriever: An instance of langchain.Retriever.
+            llm: An instance of langchain.LLM.
+        Returns:
+            QAbot: An instance of langchain.RetrievalQA.
+        """
+
         # System prompt and prompt template
         system_template = """You are an AI assistant that answers questions based on the given context.
 Your responses should be informative and relevant to the question asked.
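Note: the switch to Mistral-Nemo-Base-2407 routes inference through the novita provider on Hugging Face. A standalone smoke test of exactly this configuration, assuming HUGGINGFACEHUB_API_TOKEN is available in the environment (which the dotenv import at the top of the file suggests):

from langchain_huggingface import HuggingFaceEndpoint

llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-Nemo-Base-2407",
    provider="novita",
    temperature=0.1,
    max_new_tokens=512,
    do_sample=False,
)
print(llm.invoke("What is Machine Learning?"))  # plain-text completion from the base model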
@@ -128,9 +182,20 @@ class RAG():
         )
         return QAbot
 
-    def …
+    def setup_rag_bot(self):
+        """
+        Sets up the RAG bot by:
+        - loading the data from the input URLs and PDFs
+        - splitting the data into chunks of text
+        - creating embeddings for the text chunks
+        - creating a retriever using the embeddings
+        - creating a language model and prompts
+        - and creating a QA bot (Question-Answering bot) using the retriever and language model.
+        """
+
         # Initial data
-        …
+        documents = self.load_data(self.urls, self.pdfs)
+        texts = self.sources_to_texts(documents)
         # Create embeddings
         embeddings = self.create_embeddings()
         # Create the retriever
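Note: the middle of create_QAbot lies outside the visible hunks. Consistent with its docstring and with ask_QAbot's use of result["source_documents"], a minimal RetrievalQA sketch; the chain_type and the exact prompt wiring are assumptions:

from langchain.chains import RetrievalQA
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

system_template = """You are an AI assistant that answers questions based on the given context.
Your responses should be informative and relevant to the question asked.
Context: {context}"""
prompt = ChatPromptTemplate.from_messages([
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}"),
])
QAbot = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",            # assumed; not visible in the diff
    retriever=retriever,
    return_source_documents=True,  # needed for result["source_documents"] in ask_QAbot
    chain_type_kwargs={"prompt": prompt},
)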
@@ -144,6 +209,14 @@ class RAG():
         )
 
     def ask_QAbot(self, question):
+        """
+        Queries the QA bot with a specified question and retrieves the answer along with the sources.
+        Args:
+            question (str): The question to be asked to the QA bot.
+        Returns:
+            dict: A dictionary containing the question, answer, and sources.
+        """
+
         result = self.QAbot.invoke({"query": question})
         sources = [doc.metadata.get('source', 'Unknown source') for doc in result["source_documents"]]
         response = {
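Note: RetrievalQA returns the generated text under the "result" key; the response dict built here exposes it together with the originating sources. A short sketch of the round trip (the dict keys follow the ask_QAbot docstring):

result = QAbot.invoke({"query": "What is Machine Learning?"})
sources = [doc.metadata.get('source', 'Unknown source') for doc in result["source_documents"]]
response = {
    "question": "What is Machine Learning?",
    "answer": result["result"],  # standard RetrievalQA output key
    "sources": sources,
}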
@@ -159,8 +232,8 @@ if __name__ == "__main__":
         urls = [
             "https://en.wikipedia.org/wiki/Artificial_intelligence",
             "https://en.wikipedia.org/wiki/Machine_learning"
-        ]
-        pdfs = ["/home/onur/WORK/DS/repos/chat_with_docs/docs/the-big-book-of-mlops-v10-072023 - Databricks.pdf"]
+        ]
+        # pdfs = ["/home/onur/WORK/DS/repos/chat_with_docs/docs/the-big-book-of-mlops-v10-072023 - Databricks.pdf"]
     )
     response = rag.ask_QAbot("What is Machine Learning?")
     print(f"Question: {response['question']}")
|