# src/utils/load_and_split.py
import logging
import os

import pathspec
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

logger = logging.getLogger(__name__)


def _load_gitignore_spec(root_dir):
    """Return a PathSpec built from ``root_dir/.gitignore``, or None if absent."""
    gitignore_path = os.path.join(root_dir, ".gitignore")
    if not os.path.isfile(gitignore_path):
        return None
    # Explicit encoding so parsing does not depend on the platform default.
    with open(gitignore_path, "r", encoding="utf-8") as gitignore_file:
        patterns = gitignore_file.read().splitlines()
    return pathspec.PathSpec.from_lines(
        pathspec.patterns.GitWildMatchPattern, patterns
    )


def load_docs(root_dir, file_extensions=None):
    """
    Load documents from the specified root directory.

    Dotfiles, dot directories, and files matching the rules in
    ``root_dir/.gitignore`` (if that file exists) are skipped. Only the
    top-level .gitignore is consulted; nested .gitignore files are not read.

    Args:
        root_dir: Directory to walk recursively.
        file_extensions: Optional collection of extensions, including the
            leading dot (e.g. ``[".py", ".md"]``), compared against
            ``os.path.splitext(name)[1]``. When empty or None, no extension
            filtering is applied.

    Returns:
        A list of the documents produced by ``TextLoader.load_and_split()``
        for every accepted file. NOTE(review): ``load_and_split`` presumably
        already chunks each file with the loader's default splitter -- confirm
        against the LangChain docs before re-splitting with ``split_docs``.

    Files that cannot be loaded (e.g. not valid UTF-8) are skipped; the
    failure is logged at DEBUG level rather than raised.
    """
    spec = _load_gitignore_spec(root_dir)
    docs = []

    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Slice-assign in place so os.walk does not descend into dot directories.
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]

        for file in filenames:
            # Skip dotfiles
            if file.startswith("."):
                continue

            file_path = os.path.join(dirpath, file)

            # .gitignore patterns are relative to the directory holding the
            # .gitignore, so match on the path relative to root_dir. Matching
            # the root-prefixed path breaks anchored patterns and can ignore
            # everything when a component of root_dir itself matches a rule.
            if spec is not None and spec.match_file(
                os.path.relpath(file_path, root_dir)
            ):
                continue

            if file_extensions and os.path.splitext(file)[1] not in file_extensions:
                continue

            try:
                loader = TextLoader(file_path, encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception:
                # Best-effort walk: one unreadable/binary file must not abort
                # the whole load, but keep the failure discoverable.
                logger.debug("Skipping unreadable file %s", file_path, exc_info=True)

    return docs


def split_docs(docs):
    """
    Split the input documents into smaller chunks.

    Uses a RecursiveCharacterTextSplitter configured with chunk_size=1000 and
    chunk_overlap=100 and returns the result of ``split_documents(docs)``.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    return text_splitter.split_documents(docs)