|
|
import re |
|
|
from pathlib import Path |
|
|
|
|
|
from langchain_community.document_loaders.git import GitLoader |
|
|
|
|
|
from langflow.custom import Component |
|
|
from langflow.io import MessageTextInput, Output |
|
|
from langflow.schema import Data |
|
|
|
|
|
|
|
|
class GitLoaderComponent(Component):
    """Component that loads files from a Git repository through ``GitLoader``.

    The optional ``file_filter`` and ``content_filter`` inputs are combined
    into one predicate that is handed to ``GitLoader``; files detected as
    binary are always rejected by that predicate.
    """

    display_name = "GitLoader"
    description = "Load files from a Git repository"
    documentation = "https://python.langchain.com/v0.2/docs/integrations/document_loaders/git/"
    trace_type = "tool"
    icon = "GitLoader"
    name = "GitLoader"

    inputs = [
        MessageTextInput(
            name="repo_path",
            display_name="Repository Path",
            required=False,
            info="The local path to the Git repository.",
        ),
        MessageTextInput(
            name="clone_url",
            display_name="Clone URL",
            required=False,
            info="The URL to clone the Git repository from.",
        ),
        MessageTextInput(
            name="branch",
            display_name="Branch",
            required=False,
            value="main",
            info="The branch to load files from. Defaults to 'main'.",
        ),
        MessageTextInput(
            name="file_filter",
            display_name="File Filter",
            required=False,
            advanced=True,
            info="A list of patterns to filter files. Example to include only .py files: '*.py'. "
            "Example to exclude .py files: '!*.py'. Multiple patterns can be separated by commas.",
        ),
        MessageTextInput(
            name="content_filter",
            display_name="Content Filter",
            required=False,
            advanced=True,
            info="A regex pattern to filter files based on their content.",
        ),
    ]

    outputs = [
        Output(name="data", display_name="Data", method="load_documents"),
    ]

    @staticmethod
    def is_binary(file_path: str) -> bool:
        """Check if a file is binary by looking for null bytes.

        Only the first 1024 bytes are inspected. This is necessary because
        when searches are performed using the content_filter, binary files
        need to be ignored.

        Args:
            file_path: Path of the file to probe.

        Returns:
            True if a null byte occurs in the first 1024 bytes of the file.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with Path(file_path).open("rb") as file:
            return b"\x00" in file.read(1024)

    @staticmethod
    def _split_file_patterns(raw_patterns: str) -> tuple[list[str], list[str]]:
        """Split a comma-separated pattern string into include and exclude globs.

        Patterns prefixed with ``!`` are exclusions and are returned without
        the prefix. Empty entries (stray commas, a bare ``!``) are dropped
        because ``Path.match`` rejects an empty pattern.
        """
        include_patterns: list[str] = []
        exclude_patterns: list[str] = []
        for raw_pattern in raw_patterns.split(","):
            pattern = raw_pattern.strip()
            if pattern.startswith("!"):
                target, pattern = exclude_patterns, pattern[1:].strip()
            else:
                target = include_patterns
            if pattern:
                target.append(pattern)
        return include_patterns, exclude_patterns

    def build_gitloader(self) -> GitLoader:
        """Create a ``GitLoader`` configured from this component's inputs.

        Returns:
            A ``GitLoader`` for ``repo_path`` / ``clone_url`` / ``branch`` whose
            ``file_filter`` rejects binary files and applies the optional
            file-name and content filters.

        Raises:
            re.error: If ``content_filter`` is not a valid regular expression.
        """
        file_filter_patterns = getattr(self, "file_filter", None)
        content_filter_pattern = getattr(self, "content_filter", None)

        file_filters = []

        if file_filter_patterns:
            include_patterns, exclude_patterns = self._split_file_patterns(file_filter_patterns)

            def file_filter(file_path: Path) -> bool:
                # With no include pattern every file is a candidate, so a list
                # made only of exclusions ("!*.py, !*.md") keeps everything
                # except the excluded files.
                included = not include_patterns or any(file_path.match(pattern) for pattern in include_patterns)
                excluded = any(file_path.match(pattern) for pattern in exclude_patterns)
                return included and not excluded

            file_filters.append(file_filter)

        if content_filter_pattern:
            # Compiled once here; the closure below runs once per file.
            content_regex = re.compile(content_filter_pattern)

            def content_filter(file_path: Path) -> bool:
                content = file_path.read_text(encoding="utf-8", errors="ignore")
                return bool(content_regex.search(content))

            file_filters.append(content_filter)

        def combined_filter(file_path: str) -> bool:
            path = Path(file_path)
            try:
                if self.is_binary(file_path):
                    return False
                return all(check(path) for check in file_filters)
            except OSError:
                # A file that cannot be opened or read is skipped instead of
                # aborting the load of every other file.
                return False

        return GitLoader(
            repo_path=self.repo_path,
            clone_url=self.clone_url,
            branch=self.branch,
            file_filter=combined_filter,
        )

    def load_documents(self) -> list[Data]:
        """Load the repository files and convert each document to ``Data``.

        The resulting list is also stored in ``self.status``.

        Returns:
            One ``Data`` object per document yielded by the loader.
        """
        gitloader = self.build_gitloader()
        data = [Data.from_document(doc) for doc in gitloader.lazy_load()]
        self.status = data
        return data
|
|
|