Spaces:
Sleeping
Sleeping
| # src/utils/load_and_split.py | |
| import os | |
| import pathspec | |
| from langchain_community.document_loaders import TextLoader | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
def load_docs(root_dir, file_extensions=None):
    """
    Load documents from the specified root directory.

    Walks ``root_dir`` recursively and loads every file as UTF-8 text,
    ignoring dotfiles, dot directories, and files that match the rules of
    the ``.gitignore`` located directly in ``root_dir`` (if there is one).

    Args:
        root_dir: Directory to walk.
        file_extensions: Optional collection of extensions, including the
            leading dot (e.g. ``[".py", ".md"]``), compared against
            ``os.path.splitext(name)[1]``. When falsy, no extension
            filtering is applied.

    Returns:
        A list with the documents returned by ``TextLoader.load_and_split()``
        for each accepted file. Files that cannot be loaded are logged at
        WARNING level and skipped.
    """
    import logging

    logger = logging.getLogger(__name__)
    docs = []

    # Load .gitignore rules
    spec = None
    gitignore_path = os.path.join(root_dir, ".gitignore")
    if os.path.isfile(gitignore_path):
        with open(gitignore_path, "r", encoding="utf-8") as gitignore_file:
            gitignore = gitignore_file.read()
        spec = pathspec.PathSpec.from_lines(
            pathspec.patterns.GitWildMatchPattern, gitignore.splitlines()
        )

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Prune dot directories in place so os.walk never descends into them
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]

        for file in filenames:
            # Skip dotfiles
            if file.startswith("."):
                continue

            file_path = os.path.join(dirpath, file)

            # .gitignore patterns are relative to the directory containing
            # the .gitignore, so match on the path relative to root_dir
            # rather than on the root_dir-prefixed path.
            if spec and spec.match_file(os.path.relpath(file_path, root_dir)):
                continue

            if file_extensions and os.path.splitext(file)[1] not in file_extensions:
                continue

            try:
                loader = TextLoader(file_path, encoding="utf-8")
                # NOTE(review): load_and_split() already chunks with the
                # loader's default splitter; kept as-is so the returned
                # documents are unchanged for existing callers.
                docs.extend(loader.load_and_split())
            except Exception as exc:
                # Best effort: one unreadable (e.g. binary or non-UTF-8)
                # file must not abort the whole walk, but leave a trace.
                logger.warning("Skipping %s: %s", file_path, exc)

    return docs
def split_docs(docs, chunk_size=1000, chunk_overlap=100):
    """
    Split the input documents into smaller chunks.

    Args:
        docs: Documents to split; passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value passed as the splitter's ``chunk_size``.
            Defaults to 1000.
        chunk_overlap: Value passed as the splitter's ``chunk_overlap``.
            Defaults to 100.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)