Spaces:
Sleeping
Sleeping
| import dotenv | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import UnstructuredURLLoader, PyPDFLoader | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_openai import ChatOpenAI | |
| from langchain_huggingface import HuggingFaceEndpoint, HuggingFaceEmbeddings | |
| from langchain.chains import RetrievalQA | |
| from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate | |
class RAG:
    """Retrieval-augmented question-answering bot over URLs and PDF files.

    On construction the sources are loaded, split into overlapping chunks,
    embedded into a vector store, and wired to a language model through a
    LangChain ``RetrievalQA`` chain. Questions are asked afterwards with
    :meth:`ask_QAbot`.
    """

    # Hugging Face hosted models: model name -> (repository owner, inference provider).
    _HF_ENDPOINTS = {
        'zephyr-7b-alpha': ('HuggingFaceH4', 'hf-inference'),
        'zephyr-7b-beta': ('HuggingFaceH4', 'hf-inference'),
        'Mistral-Nemo-Base-2407': ('mistralai', 'novita'),
    }

    def __init__(
            self,
            urls=None,
            pdfs=None,
            k=3,
            *,
            use_model='gpt-4o-mini',
            use_vectordb='faiss'):
        """
        Stores the configuration and immediately builds the QA bot.

        Args:
            urls: List of source URLs to encode in the vector store (default: none).
            pdfs: List of source PDF paths to encode in the vector store (default: none).
            k: Number of relevant chunks to retrieve per question.
            use_model: Language model to use; one of 'gpt-4o-mini', 'zephyr-7b-alpha',
                'zephyr-7b-beta' or 'Mistral-Nemo-Base-2407'.
            use_vectordb: Vector store backend; 'faiss' or 'chroma'.
        """
        # Input arguments. `None` defaults avoid sharing one mutable list
        # between every instance created without explicit sources.
        self.urls = urls if urls is not None else []
        self.pdfs = pdfs if pdfs is not None else []
        self.k = k  # honour the caller's value instead of a hard-coded 3

        # Backend selection
        self.use_model = use_model
        self.use_vectordb = use_vectordb

        # Load environment variables that should contain:
        # - 'OPENAI_API_KEY' for OpenAI models
        # - 'HUGGINGFACEHUB_API_TOKEN' for HuggingFace models
        dotenv.load_dotenv(dotenv.find_dotenv())

        # Placeholder, filled in by setup_rag_bot()
        self.QAbot = None

        # Setup the bot
        self.setup_rag_bot()

    def load_data(self, urls, pdfs):
        """
        Loads data from the input URLs and PDFs.

        Args:
            urls: List of URLs to load (may be empty or None).
            pdfs: List of PDF files to load (may be empty or None).
        Returns:
            A list of Document objects loaded from the input URLs and PDFs;
            an empty list when no sources are given.
        """
        documents = []
        if urls:
            documents.extend(UnstructuredURLLoader(urls=urls).load())
        # `or []` keeps a None argument from crashing the loop.
        for pdf in pdfs or []:
            documents.extend(PyPDFLoader(pdf).load())
        return documents

    def sources_to_texts(self, documents, chunk_size=1000, chunk_overlap=200):
        """
        Splits loaded documents into overlapping text chunks.

        Args:
            documents: A list of document objects loaded from the input data.
            chunk_size: Maximum size of each chunk passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks passed to the splitter.
        Returns:
            A list of document chunks.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap)
        return text_splitter.split_documents(documents)

    def create_embeddings(self):
        """
        Creates the embedding model used to encode the text chunks.

        Returns:
            A HuggingFaceEmbeddings instance for the 'all-MiniLM-L6-v2' model.
        """
        print('Using Embeddings from HuggingFace: all-MiniLM-L6-v2')
        return HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

    def create_retriever(self, texts, embeddings):
        """
        Creates a retriever from the given document chunks and embeddings.

        The vector store backend is selected by `self.use_vectordb`
        ('chroma' or 'faiss'); the retriever returns `self.k` chunks per query.

        Args:
            texts: A list of document chunks to encode in the vector store.
            embeddings: The embedding model used for encoding the chunks.
        Returns:
            A retriever built on top of the vector store.
        Raises:
            ValueError: If `self.use_vectordb` is not a supported backend.
        """
        if self.use_vectordb == 'chroma':
            print('Creating vectore store with Chroma')
            vectorstore = Chroma.from_documents(texts, embeddings)
        elif self.use_vectordb == 'faiss':
            print('Creating vectore store with FAISS')
            vectorstore = FAISS.from_documents(texts, embeddings)
        else:
            # Fail loudly instead of hitting an unbound local at `return`.
            raise ValueError(
                f"Unsupported vector store {self.use_vectordb!r}; "
                "expected 'chroma' or 'faiss'")
        return vectorstore.as_retriever(search_kwargs={"k": self.k})

    def create_llm(self):
        """
        Instantiates a language model based on `self.use_model`.

        Supported models:
        - 'gpt-4o-mini' through the ChatOpenAI interface
        - 'zephyr-7b-alpha' / 'zephyr-7b-beta' through the HuggingFaceEndpoint
          with provider: hf-inference
        - 'Mistral-Nemo-Base-2407' through the HuggingFaceEndpoint,
          with provider: novita (at testing stage)

        Returns:
            llm: An instance of the chosen language model, either ChatOpenAI
            or HuggingFaceEndpoint.
        Raises:
            ValueError: If `self.use_model` is not a supported model.
        """
        if self.use_model == 'gpt-4o-mini':
            print(f'As llm, using OpenAI model: {self.use_model}')
            return ChatOpenAI(
                model_name=self.use_model,
                temperature=0)

        if self.use_model in self._HF_ENDPOINTS:
            owner, provider = self._HF_ENDPOINTS[self.use_model]
            print(f'As llm, using HF-Endpint: {self.use_model} through provider: {provider}')
            return HuggingFaceEndpoint(
                repo_id=f"{owner}/{self.use_model}",
                provider=provider,
                temperature=0.1,
                max_new_tokens=512,
                do_sample=False
            )

        # Fail loudly instead of hitting an unbound local at `return`.
        supported = ['gpt-4o-mini', *self._HF_ENDPOINTS]
        raise ValueError(
            f"Unsupported model {self.use_model!r}; expected one of {supported}")

    def create_QAbot(self, retriever, llm):
        """
        Creates a QAbot (Question-Answering bot) from the given retriever and language model.

        The QAbot is a RetrievalQA chain built with Langchain that, for a given question:
        - uses the given retriever to get the relevant documents
        - and the given language model to generate an answer.

        Args:
            retriever: The retriever that supplies context documents.
            llm: The language model that generates the answer.
        Returns:
            QAbot: A RetrievalQA chain that also returns its source documents.
        """
        # System prompt and prompt template
        system_template = """You are an AI assistant that answers questions based on the given context.
Your responses should be informative and relevant to the question asked.
If you don't know the answer or if the information is not present in the context, say so."""
        human_template = """Context: {context}
Question: {question}
Answer: """
        # Create the prompt
        prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ])
        return RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs={"prompt": prompt}
        )

    def setup_rag_bot(self):
        """
        Sets up the RAG bot by:
        - loading the data from the input URLs and PDFs
        - splitting the data into chunks of text
        - creating embeddings for the text chunks
        - creating a retriever using the embeddings
        - creating a language model and prompts
        - and creating a QA bot (Question-Answering bot) using the retriever and language model.

        The resulting chain is stored in `self.QAbot`.
        """
        # Initial data
        documents = self.load_data(self.urls, self.pdfs)
        texts = self.sources_to_texts(documents)
        # Create embeddings
        embeddings = self.create_embeddings()
        # Create the retriever
        retriever = self.create_retriever(texts, embeddings)
        # Create the llm
        llm = self.create_llm()
        # Create a QA bot
        self.QAbot = self.create_QAbot(retriever, llm)

    def ask_QAbot(self, question):
        """
        Queries the QA bot with a specified question and retrieves the answer along with the sources.

        Args:
            question (str): The question to be asked to the QA bot.
        Returns:
            dict: A dictionary with the keys "question", "answer" and "sources";
            a source is 'Unknown source' when a document carries no 'source' metadata.
        """
        result = self.QAbot.invoke({"query": question})
        sources = [
            doc.metadata.get('source', 'Unknown source')
            for doc in result["source_documents"]
        ]
        return {
            "question": question,
            "answer": result["result"],
            "sources": sources
        }
if __name__ == "__main__":
    # Demo run: index one local PDF and ask a single question about it.
    # Other sources can be supplied the same way, e.g. a list of web pages
    # through `urls=[...]` or further PDF paths through `pdfs=[...]`.
    pdf_paths = ['/home/onur/Desktop/job_app/Resume_Onur_Kerimoglu.pdf']
    question = "What technical skills does Onur Kerimoglu possess?"

    rag = RAG(pdfs=pdf_paths)
    response = rag.ask_QAbot(question)

    # Report the answer followed by the source of each retrieved chunk.
    print(f"Question: {response['question']}")
    print(f"Answer: {response['answer']}")
    print("Sources:")
    for src in response['sources']:
        print(f"- {src}")