from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFaceHub
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

# Load environment variables (e.g. HUGGINGFACEHUB_API_TOKEN) from a .env file.
load_dotenv()

## Read Multiple PDF files
def read_multiple_pdf(files):
    """Extract and concatenate the text of every page in the given PDF file(s)."""
    # Accept a single path string as well as a list of paths.
    if isinstance(files, str):
        files = [files]
    texts = ""
    for file in files:
        docs = PdfReader(file)
        for page in docs.pages:
            texts += page.extract_text()
    return texts

## Split PDF into chunks
def chunk_docs(document, chunk_size=500, chunk_overlap=50, separators=None):
    """
    Split a document into smaller chunks of text.

    Args:
        document (str): The document to be chunked.
        chunk_size (int, optional): The size of each chunk in characters. Defaults to 500.
        chunk_overlap (int, optional): The overlap between adjacent chunks in characters. Defaults to 50.
        separators (list[str], optional): The separators used to split the document into chunks. Defaults to ["\\n"].

    Returns:
        list[str]: The text chunks.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        # RecursiveCharacterTextSplitter expects a list of separator strings.
        separators=separators or ["\n"],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunks = text_splitter.split_text(document)
    return chunks

## Embed the chunks
def embedding_chunks(chunks, model_name="sentence-transformers/all-MiniLM-L12-v2"):
    """Embed the text chunks and index them in a FAISS vector store."""
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_store = FAISS.from_texts(chunks, embeddings)
    return vector_store

## Set up the conversational chain
def chain_conversation(vector_store, config=None, model_repo="mistralai/Mixtral-8x7B-Instruct-v0.1"):
    """Build a ConversationalRetrievalChain with buffer memory over the vector store."""
    if config is None:
        config = {"max_new_tokens": 256, "temperature": 0.1}
    llm = HuggingFaceHub(repo_id=model_repo, model_kwargs=config)
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
    conversation_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=vector_store.as_retriever(search_kwargs={"k": 10}),
        memory=memory,
    )
    return conversation_chain
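
## Example usage: a minimal sketch, not part of the original module. The PDF
## path and question below are placeholders, and running it assumes
## HUGGINGFACEHUB_API_TOKEN is available via the .env file loaded above.
if __name__ == "__main__":
    raw_text = read_multiple_pdf("document/yolo.pdf")  # placeholder path
    chunks = chunk_docs(raw_text)
    vector_store = embedding_chunks(chunks)
    chain = chain_conversation(vector_store)
    # ConversationalRetrievalChain takes the query under the "question" key
    # and returns the response under "answer".
    result = chain({"question": "What does this document cover?"})
    print(result["answer"])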