| |
| from langchain.text_splitter import RecursiveCharacterTextSplitter |
| from transformers import AutoTokenizer |
| from sentence_transformers import SentenceTransformer |
| from langchain_community.document_loaders import PyPDFLoader |
| from langchain.embeddings import HuggingFaceEmbeddings |
| from langchain.vectorstores import FAISS |
| from transformers import AutoTokenizer, AutoModelForQuestionAnswering,pipeline |
| from transformers import AutoTokenizer, pipeline |
| from langchain.docstore.document import Document as LangchainDocument |
| from typing import List, Optional |
| |
| |
|
|
| |
|
|
|
|
def split_documents(
    chunk_size: int,
    knowledge_base: List[LangchainDocument],
    separator: Optional[List[str]] = None,
) -> List[LangchainDocument]:
    """Split documents into chunks of at most `chunk_size` and de-duplicate.

    Args:
        chunk_size: Maximum size of each chunk, as measured by the splitter's
            default length function. The overlap between adjacent chunks is
            fixed at 10% of `chunk_size`.
        knowledge_base: Source documents to split.
        separator: Ordered list of separators for the recursive splitter;
            `None` falls back to the splitter's built-in defaults.

    Returns:
        The chunked documents with exact-duplicate texts removed; the first
        occurrence of each `page_content` is kept, preserving input order.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        # 10% overlap keeps context continuity between adjacent chunks.
        chunk_overlap=chunk_size // 10,
        strip_whitespace=True,
        separators=separator,
    )

    docs_processed = text_splitter.split_documents(knowledge_base)

    # Drop chunks whose text has already been seen, keeping first-seen order.
    seen_texts = set()
    docs_processed_unique = []
    for doc in docs_processed:
        if doc.page_content not in seen_texts:
            seen_texts.add(doc.page_content)
            docs_processed_unique.append(doc)

    return docs_processed_unique