OnurKerimoglu committed on
Commit
c6fa9db
·
1 Parent(s): 3446837

introduced src.rag.py

Browse files
Files changed (1) hide show
  1. src/rag.py +143 -0
src/rag.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import dotenv
3
+ import os
4
+ from langchain.document_loaders import UnstructuredURLLoader, PyPDFLoader
5
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
6
+
7
+ # from langchain.embeddings import OpenAIEmbeddings
8
+ from langchain_community.embeddings import HuggingFaceEmbeddings
9
+
10
+ from langchain.vectorstores import Chroma
11
+ from langchain.chat_models import ChatOpenAI
12
+ from langchain.chains import RetrievalQA
13
+ from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
14
+ from tqdm import tqdm
15
+
16
class RAG:
    """Retrieval-Augmented Generation pipeline over URLs and PDFs.

    Loads the given sources, splits them into overlapping text chunks,
    embeds them into a Chroma vector store, and wires a RetrievalQA chain
    (ChatOpenAI + custom prompt) that answers questions with sources.
    """

    def __init__(self, urls=None, pdfs=None, k=3):
        """Build the full RAG stack from the given sources.

        Args:
            urls: source URLs to encode in the vector store (default: none).
            pdfs: source PDF file paths to encode in the vector store
                (default: none).
            k: number of relevant chunks to retrieve per query.
        """
        # Normalize defaults here rather than using mutable default
        # arguments ([]), which would be shared across instances.
        self.urls = urls if urls is not None else []
        self.pdfs = pdfs if pdfs is not None else []
        # Bug fix: the original hard-coded `self.k = 3`, silently
        # ignoring the caller-supplied `k`.
        self.k = k

        # Load environment variables that should contain an 'OPENAI_API_KEY'
        dotenv.load_dotenv(dotenv.find_dotenv())

        # Placeholder; populated by setup_rag_bots() below.
        self.QAbot = None

        # Setup the bots
        self.setup_rag_bots()

    def load_data(self, urls, pdfs):
        """Load raw documents from URLs and PDF paths.

        Returns a flat list of LangChain Document objects.
        """
        documents = []
        if urls:
            url_loader = UnstructuredURLLoader(urls=urls)
            documents.extend(url_loader.load())
        for pdf in pdfs:
            pdf_loader = PyPDFLoader(pdf)
            documents.extend(pdf_loader.load())
        return documents

    def sources_to_texts(self, urls, pdfs):
        """Load the sources and split them into retrieval-sized chunks."""
        documents = self.load_data(urls, pdfs)

        # Chunking parameters for the retrieval system; overlap keeps
        # context that straddles chunk boundaries retrievable.
        chunk_size = 1000
        chunk_overlap = 200

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap)
        texts = text_splitter.split_documents(documents)
        return texts

    def create_embeddings(self):
        """Return the embedding model used for the vector store.

        Uses a local HuggingFace sentence-transformer instead of
        OpenAIEmbeddings, so embedding needs no OpenAI API calls.
        """
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        return embeddings

    def create_retriever(self, texts, embeddings):
        """Embed the chunks into Chroma and return a top-k retriever."""
        vectorstore = Chroma.from_documents(texts, embeddings)
        retriever = vectorstore.as_retriever(search_kwargs={"k": self.k})
        return retriever

    def create_llm(self):
        """Create the chat language model (deterministic: temperature=0)."""
        llm = ChatOpenAI(
            model_name="gpt-4o-mini",
            temperature=0)
        return llm

    def create_QAbot(self, retriever, llm):
        """Assemble the RetrievalQA chain with a grounded-answer prompt."""
        # System prompt instructs the model to stay within the retrieved
        # context and admit when the answer is not present.
        system_template = """You are an AI assistant that answers questions based on the given context.
Your responses should be informative and relevant to the question asked.
If you don't know the answer or if the information is not present in the context, say so."""

        human_template = """Context: {context}

Question: {question}

Answer: """

        # Create the prompt
        system_message_prompt = SystemMessagePromptTemplate.from_template(system_template)
        human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)
        prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
        QAbot = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,  # needed so ask_QAbot can report sources
            chain_type_kwargs={"prompt": prompt}
        )
        return QAbot

    def setup_rag_bots(self):
        """Run the full setup: texts -> embeddings -> retriever -> QA chain."""
        # Initial data
        texts = self.sources_to_texts(self.urls, self.pdfs)
        # Create embeddings
        embeddings = self.create_embeddings()
        # Create the retriever
        retriever = self.create_retriever(texts, embeddings)
        # Create the llm and prompts
        llm = self.create_llm()
        # Create a QA bot
        self.QAbot = self.create_QAbot(
            retriever,
            llm
        )

    def ask_QAbot(self, question):
        """Answer `question` with the QA chain.

        Returns:
            dict with keys 'question', 'answer', and 'sources' (list of
            source identifiers of the retrieved documents).

        Raises:
            RuntimeError: if the QA chain has not been initialized.
        """
        # Fail loudly instead of with an opaque 'NoneType not callable'.
        if self.QAbot is None:
            raise RuntimeError("QAbot is not initialized; run setup_rag_bots() first.")
        result = self.QAbot({"query": question})
        sources = [doc.metadata.get('source', 'Unknown source') for doc in result["source_documents"]]
        response = {
            "question": question,
            "answer": result["result"],
            "sources": sources
        }
        return response
128
+
129
+
130
if __name__ == "__main__":
    # Demo: index two Wikipedia pages plus a local PDF, then ask one
    # question and print the answer with its sources.
    demo_urls = [
        "https://en.wikipedia.org/wiki/Artificial_intelligence",
        "https://en.wikipedia.org/wiki/Machine_learning"
    ]
    demo_pdfs = [
        "/home/onur/WORK/DS/repos/chat_with_docs/docs/the-big-book-of-mlops-v10-072023 - Databricks.pdf"
    ]
    rag = RAG(urls=demo_urls, pdfs=demo_pdfs)
    response = rag.ask_QAbot("What is Machine Learning?")
    print(f"Question: {response['question']}")
    print(f"Answer: {response['answer']}")
    print("Sources:")
    for src in response['sources']:
        print(f"- {src}")