# Spaces: Paused  (page-header residue from the web export; not part of the program)
import os
from enum import Enum

from langchain.text_splitter import (
    NLTKTextSplitter,
    RecursiveCharacterTextSplitter,
    SpacyTextSplitter,
)
from langchain_community.document_loaders import PyPDFLoader, TextLoader
# Candidate split points handed to RecursiveCharacterTextSplitter (see
# TextLoaderAndSplitterWrapper).  Besides the usual whitespace and ASCII
# punctuation, the list contains full-width / ideographic punctuation and the
# zero-width space so that text written without spaces between words can
# still be split.  The trailing empty string is kept as the last entry.
# NOTE(review): the splitter is expected to try these in list order - confirm
# against the LangChain documentation before reordering.
separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
]
class ChunkingStrategy(Enum):
    """Closed set of text-chunking strategies a caller can select.

    Each member's value is a stable, lowercase string identifier, so a
    strategy can be looked up from a string with ``ChunkingStrategy(value)``.
    """

    # Selects RecursiveCharacterTextSplitter.
    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
    # Selects NLTKTextSplitter.
    NLTK_TEXT_SPLITTER = "nltk_text_splitter"
    # Selects SpacyTextSplitter.
    SPACY_TEXT_SPLITTER = "spacy_text_splitter"
class TextLoaderAndSplitterWrapper:
    """Load a ``.pdf`` or ``.txt`` file and chunk it with a chosen splitter.

    The splitter is built once, at construction time, from a
    ``ChunkingStrategy`` member.  The file itself is not touched until
    :meth:`load_documents` is called.
    """

    # Separator used by join().  It is the first entry of the module-level
    # ``separators`` list.
    # NOTE(review): presumably also the default separator of the NLTK and
    # spaCy splitters - confirm against the installed LangChain version.
    _JOIN_SEPARATOR = "\n\n"

    def __init__(self, strategy: "ChunkingStrategy", file_path: "str | os.PathLike[str]"):
        """Select the splitter for ``strategy`` and remember ``file_path``.

        Args:
            strategy: Which splitter implementation to use.
            file_path: Path of the file to load later; a ``str`` or any
                path-like object (it is stored as a ``str``).

        Raises:
            ValueError: If ``strategy`` is not a known ``ChunkingStrategy``.
            TypeError: If ``file_path`` is not a str, bytes or path-like.
        """
        # Result of the most recent load_documents() call.
        self.documents = []
        # Splitter instance matching the requested strategy.
        self.splitter = self._build_splitter(strategy)
        # Normalise to str so the extension checks in load_documents() work
        # for pathlib.Path arguments as well as plain strings.
        self.file_path = os.fsdecode(file_path)

    @staticmethod
    def _build_splitter(strategy):
        """Return a new splitter for ``strategy``; raise ValueError if unknown."""
        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
            return RecursiveCharacterTextSplitter(separators=separators)
        if strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
            return NLTKTextSplitter()
        if strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
            return SpacyTextSplitter()
        raise ValueError(f"Unknown strategy: {strategy}")

    def load_documents(self):
        """Load ``self.file_path`` and split it with the configured splitter.

        The loader is picked from the file extension, compared
        case-insensitively: ``.pdf`` uses ``PyPDFLoader`` and ``.txt`` uses
        ``TextLoader``.

        Returns:
            The chunks produced by the loader's ``load_and_split``; the same
            object is also stored on ``self.documents``.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        lowered = self.file_path.lower()
        if lowered.endswith(".pdf"):
            loader = PyPDFLoader(self.file_path)
        elif lowered.endswith(".txt"):
            loader = TextLoader(self.file_path)
        else:
            raise ValueError(f"Unknown file type: {self.file_path}")
        # Always pass our splitter explicitly so the loader does not fall
        # back to one of its own choosing.
        self.documents = loader.load_and_split(text_splitter=self.splitter)
        return self.documents

    def split(self, text: str):
        """Split raw ``text`` into chunks using the configured splitter.

        Delegates to the splitter's ``split_text`` method.
        """
        return self.splitter.split_text(text)

    def join(self, chunks: list) -> str:
        """Concatenate ``chunks`` into one string, separated by a blank line.

        Items may be plain strings or objects exposing a ``page_content``
        attribute, in which case that attribute is used.  This is a
        convenience and not guaranteed to reproduce the text that was
        originally split.
        """
        texts = [getattr(chunk, "page_content", chunk) for chunk in chunks]
        return self._JOIN_SEPARATOR.join(texts)

    def __str__(self):
        """Return a short description naming the active splitter."""
        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"

    def __repr__(self):
        """Return the same text as ``str(self)``."""
        return str(self)