code-weaver / src /utils /load_and_split.py
whitelotus0's picture
code weaver
fff1c68
# src/utils/load_and_split.py
import os
import pathspec
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def load_docs(root_dir, file_extensions=None):
    """
    Load documents from the specified root directory.

    Walks ``root_dir`` with ``os.walk`` and skips dotfiles, dot directories,
    and anything matched by the ``.gitignore`` file located directly in
    ``root_dir`` (nested ``.gitignore`` files are not consulted). Every
    remaining file is opened as UTF-8 through ``TextLoader`` and the result
    of its ``load_and_split()`` call is appended to the output.

    Args:
        root_dir: Directory to scan recursively.
        file_extensions: Optional collection of extensions, each including
            the leading dot (e.g. ``[".py", ".md"]``). A single string is
            treated as one extension. When falsy, no extension filter is
            applied.

    Returns:
        A list with the documents produced for every file that loaded
        successfully. Files that fail to load are skipped with a warning.
    """
    import logging

    logger = logging.getLogger(__name__)

    # A bare string would otherwise be substring-tested ("" in ".py" is
    # True), letting extensionless files slip through the filter.
    if isinstance(file_extensions, str):
        file_extensions = [file_extensions]

    # Load .gitignore rules
    spec = None
    gitignore_path = os.path.join(root_dir, ".gitignore")
    if os.path.isfile(gitignore_path):
        with open(
            gitignore_path, "r", encoding="utf-8", errors="replace"
        ) as gitignore_file:
            spec = pathspec.PathSpec.from_lines(
                pathspec.patterns.GitWildMatchPattern,
                gitignore_file.read().splitlines(),
            )

    docs = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # .gitignore patterns are written relative to the directory holding
        # the .gitignore, so match against paths relative to root_dir rather
        # than paths that still carry the root_dir prefix.
        rel_dir = os.path.relpath(dirpath, root_dir)
        if rel_dir == os.curdir:
            rel_dir = ""

        # Prune dot directories and ignored directories in place so that
        # os.walk never descends into them. The trailing "/" lets
        # directory-only patterns such as "node_modules/" match.
        dirnames[:] = [
            d
            for d in dirnames
            if not d.startswith(".")
            and not (spec and spec.match_file(os.path.join(rel_dir, d) + "/"))
        ]

        for file in filenames:
            # Skip dotfiles
            if file.startswith("."):
                continue
            # Skip files that match .gitignore rules
            if spec and spec.match_file(os.path.join(rel_dir, file)):
                continue
            if file_extensions and os.path.splitext(file)[1] not in file_extensions:
                continue

            file_path = os.path.join(dirpath, file)
            try:
                loader = TextLoader(file_path, encoding="utf-8")
                docs.extend(loader.load_and_split())
            except Exception:
                # Deliberately best-effort: one unreadable file (presumably
                # binary or non-UTF-8 content) must not abort the whole
                # scan, but the skip is reported instead of hidden.
                logger.warning("Skipping %s: failed to load", file_path, exc_info=True)
    return docs
def split_docs(docs, chunk_size=1000, chunk_overlap=100):
    """Split the input documents into smaller chunks.

    Args:
        docs: Documents to split; handed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Value forwarded as the splitter's ``chunk_size``.
            Defaults to 1000, the value previously hard-coded here.
        chunk_overlap: Value forwarded as the splitter's ``chunk_overlap``.
            Defaults to 100, the value previously hard-coded here.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(docs)