# NOTE(review): removed non-code export artifacts that preceded this file
# ("Spaces:" and two "Runtime error" lines) — they were not part of the source.
| import pandas as pd | |
| from langchain_community.document_loaders import TextLoader | |
| from langchain_community.docstore.document import Document | |
| from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter | |
| from langchain_community.vectorstores import Chroma | |
| from langchain_community.embeddings import HuggingFaceEmbeddings | |
| from langchain_community.retrievers import BM25Retriever | |
| from langchain_community.llms import OpenAI | |
| from langchain_openai import ChatOpenAI | |
| from langchain.chains import RetrievalQA | |
| from langchain.schema import AIMessage, HumanMessage | |
| from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder | |
| import os | |
| from langchain.retrievers import ParentDocumentRetriever | |
| from langchain.storage import InMemoryStore | |
def split_with_source(text, source):
    """Split raw text into newline-separated chunks and tag each with its origin.

    Args:
        text: Full document text to split.
        source: Identifier (e.g. a filename) written into each chunk's metadata.

    Returns:
        List of Document chunks (max 400 chars, no overlap), each carrying
        ``{"source": source}`` in its metadata.
    """
    chunker = CharacterTextSplitter(
        separator="\n",
        chunk_size=400,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    chunks = chunker.create_documents([text])
    for chunk in chunks:
        chunk.metadata["source"] = source
    return chunks
def get_document_from_raw_text_each_line():
    """Load every file under ./raw_data and wrap each stripped line as a Document.

    Returns:
        A list of Documents: a leading empty sentinel Document (metadata
        ``{'source': 0}``, kept for backward compatibility with the original
        output shape) followed by one Document per file line, each tagged
        with its source filename.
    """
    documents = [Document(page_content="", metadata={'source': 0})]
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for filename in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, filename), 'r', encoding="utf-8") as file:
            # Stream line by line instead of file.readlines() (the original
            # materialized the whole file and also built an unused `text` list).
            for line in file:
                documents.append(
                    Document(page_content=line.strip(), metadata={"source": filename})
                )
    return documents
def count_files_in_folder(folder_path):
    """Return the number of directory entries in *folder_path*.

    Prints a warning and returns None when the path is not an existing
    directory. NOTE: subdirectories are counted too, mirroring os.listdir.
    """
    if not os.path.isdir(folder_path):
        print("Đường dẫn không hợp lệ.")  # "Invalid path."
        return None
    return len(os.listdir(folder_path))
def get_document_from_raw_text():
    """Chunk every file under ./raw_data via split_with_source.

    Returns:
        A leading empty sentinel Document (metadata ``{'source': 0}``)
        followed by the 400-char chunks of each file, each tagged with
        its source filename.
    """
    documents = [Document(page_content="", metadata={'source': 0})]
    raw_dir = os.path.join(os.getcwd(), "raw_data")
    for filename in os.listdir(raw_dir):
        with open(os.path.join(raw_dir, filename), 'r', encoding="utf-8") as handle:
            # Collapse double newlines before chunking.
            cleaned = handle.read().replace('\n\n', "\n")
        documents += split_with_source(cleaned, filename)
    return documents
def get_document_from_table():
    """Load every CSV under ./table_data and wrap each row's 'data' column as a Document.

    Returns:
        A leading empty sentinel Document (metadata ``{'source': 0}``)
        followed by one Document per CSV row, tagged with the CSV filename.
    """
    documents = [Document(page_content="", metadata={'source': 0})]
    table_dir = os.path.join(os.getcwd(), "table_data")
    for filename in os.listdir(table_dir):
        frame = pd.read_csv(os.path.join(table_dir, filename))
        for _, row in frame.iterrows():
            documents.append(
                Document(page_content=row['data'], metadata={"source": filename})
            )
    return documents
def load_the_embedding_retrieve(is_ready = False, k = 3, model= 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Build a Chroma vector retriever backed by HuggingFace embeddings.

    Args:
        is_ready: If True, reuse the persisted index under ./Data; otherwise
            embed the raw-text and table documents from scratch.
        k: Number of documents returned per query.
        model: HuggingFace sentence-transformer model name.

    Returns:
        A Chroma retriever configured with ``search_kwargs={"k": k}``.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model)
    search_kwargs = {"k": k}
    if is_ready:
        store = Chroma(
            persist_directory=os.path.join(os.getcwd(), "Data"),
            embedding_function=embeddings,
        )
    else:
        corpus = get_document_from_raw_text() + get_document_from_table()
        store = Chroma.from_documents(corpus, embeddings)
    return store.as_retriever(search_kwargs=search_kwargs)
def load_the_bm25_retrieve(k = 3):
    """Build a BM25 (lexical) retriever over the raw-text and table documents.

    Args:
        k: Number of documents returned per query.
    """
    corpus = get_document_from_raw_text() + get_document_from_table()
    retriever = BM25Retriever.from_documents(corpus)
    retriever.k = k
    return retriever
def load_the_parent_document_retrieve(model= 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
    """Build a ParentDocumentRetriever: search on small chunks, return larger parents.

    Child chunks (400 chars) are embedded into Chroma for similarity search;
    parent chunks (1200 chars) live in an in-memory docstore and are what the
    retriever actually returns.

    Args:
        model: HuggingFace sentence-transformer model name for embeddings.
    """
    embeddings = HuggingFaceEmbeddings(model_name=model)
    child_store = Chroma(collection_name="split_parents", embedding_function=embeddings)
    parent_store = InMemoryStore()
    # Parent chunks are 3x the child size; neither overlaps.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1200,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=0,
        length_function=len,
        add_start_index=True,
    )
    retriever = ParentDocumentRetriever(
        vectorstore=child_store,
        docstore=parent_store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
    )
    retriever.add_documents(get_document_from_raw_text())
    return retriever
def get_qachain(llm_name = "gpt-3.5-turbo-0125", chain_type = "stuff", retriever = None, return_source_documents = True):
    """Assemble a RetrievalQA chain around a zero-temperature ChatOpenAI model.

    Args:
        llm_name: OpenAI chat model identifier.
        chain_type: LangChain combine-documents strategy (e.g. "stuff").
        retriever: Retriever supplying context documents.
        return_source_documents: Whether responses include source documents.
    """
    chat_model = ChatOpenAI(temperature=0, model_name=llm_name)
    return RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=return_source_documents,
    )
def summarize_messages(demo_ephemeral_chat_history, llm):
    """Condense the first human/AI exchange in the history into one summary message.

    The history is cleared and replaced with a single summary message produced
    by the LLM using the SUMARY_MESSAGE_PROMPT environment variable as the
    system prompt.

    Args:
        demo_ephemeral_chat_history: Chat history with a ``.messages`` list,
            ``.clear()`` and ``.add_message()``.
        llm: A LangChain-compatible chat model (used via prompt | llm).

    Returns:
        False when the history is empty; otherwise the history now containing
        only the summary message.
    """
    stored_messages = demo_ephemeral_chat_history.messages
    # BUG FIX: the original indexed stored_messages[0]/[1] BEFORE this guard,
    # so an empty history raised IndexError instead of returning False.
    if len(stored_messages) == 0:
        return False
    # NOTE(review): still assumes at least two messages (human then AI) —
    # a single-message history will raise IndexError; confirm with callers.
    human_chat = stored_messages[0].content
    ai_chat = stored_messages[1].content
    summarization_prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system", os.environ['SUMARY_MESSAGE_PROMPT'],
            ),
            (
                "human",
                '''
                History:
                Human: {human}
                AI: {AI}
                Output:
                '''
            )
            ,
        ]
    )
    summarization_chain = summarization_prompt | llm
    summary_message = summarization_chain.invoke({"AI": ai_chat, "human": human_chat})
    demo_ephemeral_chat_history.clear()
    demo_ephemeral_chat_history.add_message(summary_message)
    return demo_ephemeral_chat_history
def get_question_from_summarize(summary, question, llm):
    """Rewrite a follow-up question into a standalone one using the chat summary.

    Uses the NEW_QUESTION_PROMPT environment variable as the system prompt.

    Args:
        summary: Conversation summary text.
        question: The user's latest question.
        llm: A LangChain-compatible chat model.

    Returns:
        The rewritten question text (``.content`` of the LLM response).
    """
    rewrite_prompt = ChatPromptTemplate.from_messages([
        ("system", os.environ['NEW_QUESTION_PROMPT']),
        ("human",
            '''
            Summary: {summary}
            Question: {question}
            Output:
            '''
        )
        ]
    )
    rewrite_chain = rewrite_prompt | llm
    response = rewrite_chain.invoke({'summary': summary, 'question': question})
    return response.content
def get_final_answer(question, context, prompt, llm):
    """Answer a question from retrieved context using the given system prompt.

    Args:
        question: The user's question.
        context: Retrieved supporting text.
        prompt: System prompt string.
        llm: A LangChain-compatible chat model.

    Returns:
        The answer text (``.content`` of the LLM response).
    """
    qa_prompt = ChatPromptTemplate.from_messages(
        [
            ("system", prompt),
            ("human", '''
            Context: {context}
            Question: {question}
            Output: '''),
        ]
    )
    response = (qa_prompt | llm).invoke({'question': question, 'context': context})
    return response.content
def process_llm_response(llm_response):
    """Print a RetrievalQA response: the answer, then each source's metadata.

    Args:
        llm_response: Mapping with 'result' (answer text) and
            'source_documents' (Documents carrying metadata['source']).
    """
    print(llm_response['result'])
    print('\n\nSources:')
    for doc in llm_response["source_documents"]:
        print(doc.metadata['source'])