selfevolveagent / evoagentx /rag /readers /base.py

Upload 2846 files

5374a2d verified 19 days ago

7.41 kB

	import asyncio
	from pathlib import Path
	from typing import Union, List, Tuple, Optional, Callable, Dict

	from llama_index.core import SimpleDirectoryReader

	from evoagentx.rag.schema import Document
	from evoagentx.core.logging import logger


	# You can follow the LlamaIndex tutorial to develop a valid reader for a new file format:
	# https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader/
	class LLamaIndexReader:
	    """A universal file reader based on LlamaIndex's SimpleDirectoryReader.

	    This class provides a flexible interface for loading documents from files or directories,
	    supporting various formats (e.g., PDF, Word, Markdown) with customizable filtering and metadata.

	    Attributes:
	        recursive (bool): Whether to recursively load files from directories.
	        exclude_hidden (bool): Whether to exclude hidden files (starting with '.').
	        num_workers (Optional[int]): Number of workers for parallel loading
	            (only forwarded on the async loading path, see ``load``).
	        num_files_limits (Optional[int]): Maximum number of files to load.
	        custom_metadata_function (Optional[Callable]): Custom function to extract metadata.
	        extern_file_extractor (Optional[Dict]): Custom file extractors for specific file types.
	        errors (str): Error handling strategy for file reading (e.g., 'ignore', 'strict').
	        encoding (str): File encoding (default: 'utf-8').
	    """

	    def __init__(
	        self,
	        recursive: bool = False,
	        exclude_hidden: bool = True,
	        num_workers: Optional[int] = None,
	        num_files_limits: Optional[int] = None,
	        custom_metadata_function: Optional[Callable] = None,
	        extern_file_extractor: Optional[Dict] = None,
	        errors: str = "ignore",
	        encoding: str = "utf-8",
	    ):
	        """Store the reader configuration; no file access happens here."""
	        self.recursive = recursive
	        self.exclude_hidden = exclude_hidden
	        self.num_workers = num_workers
	        self.num_files_limits = num_files_limits
	        self.custom_metadata_function = custom_metadata_function
	        self.extern_file_extractor = extern_file_extractor
	        self.errors = errors
	        self.encoding = encoding

	    def _validate_path(self, path: Union[str, Path]) -> Path:
	        """Validate and convert a path to a Path object.

	        Args:
	            path: A string or Path object representing a file or directory.

	        Returns:
	            Path: The path as a Path object, guaranteed to exist on disk.

	        Raises:
	            FileNotFoundError: If the path does not exist.
	        """
	        path = Path(path)
	        if not path.exists():
	            logger.error(f"Path does not exist: {path}")
	            raise FileNotFoundError(f"Path does not exist: {path}")
	        return path

	    def _check_input(
	        self, input_data: Union[str, Path, List, Tuple], is_file: bool = True
	    ) -> Union[List[Path], Path]:
	        """Validate the input and convert it to Path object(s).

	        Args:
	            input_data: A single path (string or Path), or a list/tuple of paths.
	            is_file: For a list/tuple, whether to validate every entry as a
	                file path (True) or to use only the first entry as a
	                directory (False). Ignored for a single path.

	        Returns:
	            Union[List[Path], Path]: A single validated Path for a single
	            input (or for a sequence with ``is_file=False``); a list of
	            validated Paths for a sequence with ``is_file=True``.

	        Raises:
	            FileNotFoundError: If any referenced path does not exist.
	            ValueError: If the input type is invalid, or an empty sequence
	                is given where a directory is expected.
	        """
	        # Path objects are accepted alongside strings, mirroring _validate_path.
	        if isinstance(input_data, (str, Path)):
	            return self._validate_path(input_data)

	        if isinstance(input_data, (list, tuple)):
	            if is_file:
	                return [self._validate_path(p) for p in input_data]
	            if not input_data:
	                # Fail with the documented ValueError rather than a bare IndexError.
	                logger.error("Empty input: expected at least one directory path")
	                raise ValueError("Empty input: expected at least one directory path")
	            return self._validate_path(input_data[0])

	        logger.error(f"Invalid input type: {type(input_data)}")
	        raise ValueError(f"Invalid input type: {type(input_data)}")

	    def _resolve_exclude(
	        self, exclude_files: Optional[Union[str, Path, List, Tuple]]
	    ) -> Optional[List[Path]]:
	        """Return the validated exclusion paths as a list, or None when nothing is excluded."""
	        if not exclude_files:
	            return None
	        checked = self._check_input(exclude_files, is_file=True)
	        # A single string/Path comes back as a bare Path; the reader's
	        # ``exclude`` argument is iterated, so it must always be a list.
	        return checked if isinstance(checked, list) else [checked]

	    @staticmethod
	    def _normalize_suffixes(
	        filter_file_by_suffix: Optional[Union[str, List, Tuple]]
	    ) -> Optional[List]:
	        """Return the suffix filter as a list (single string wrapped), or None for other input."""
	        if isinstance(filter_file_by_suffix, (list, tuple)):
	            return list(filter_file_by_suffix)
	        if isinstance(filter_file_by_suffix, str):
	            return [filter_file_by_suffix]
	        return None

	    @staticmethod
	    def _merge_docs_by_file(llama_docs: List) -> List:
	        """Merge LlamaIndex documents sharing a ``file_path`` into one document per file.

	        Documents are grouped in first-seen order by ``metadata["file_path"]``
	        (missing key groups under ""). Each group's texts are joined with
	        newlines onto a copy of the group's first document, and the number of
	        merged documents is recorded in ``metadata["page_count"]``.
	        """
	        grouped = {}
	        for doc in llama_docs:
	            grouped.setdefault(doc.metadata.get("file_path", ""), []).append(doc)

	        merged = []
	        for docs in grouped.values():
	            combined_text = "\n".join(doc.text for doc in docs)
	            # NOTE(review): copy() is presumably shallow, so text_resource and
	            # metadata may be shared with docs[0]; harmless here because the
	            # per-page documents are discarded after merging - confirm.
	            combined = docs[0].copy()
	            combined.text_resource.text = combined_text
	            combined.metadata["page_count"] = len(docs)
	            merged.append(combined)
	        return merged

	    def load(
	        self,
	        file_paths: Union[str, List, Tuple],
	        exclude_files: Optional[Union[str, List, Tuple]] = None,
	        filter_file_by_suffix: Optional[Union[str, List, Tuple]] = None,
	        merge_by_file: bool = False,
	        show_progress: bool = False,
	        use_async: bool = False,
	    ) -> List[Document]:
	        """Load documents from files or directories.

	        Args:
	            file_paths: A list/tuple of file paths, or a single path (string
	                or Path) pointing to either a file or a directory.
	            exclude_files: A path or list/tuple of paths to exclude from loading.
	            filter_file_by_suffix: File extensions to include (e.g., ['.pdf', '.docx']).
	            merge_by_file: If True, documents originating from the same file
	                are merged into a single document with a ``page_count`` metadata entry.
	            show_progress: Forwarded to the underlying reader's load call.
	            use_async: If True, load through the reader's async API driven by
	                ``asyncio.run``; otherwise load synchronously.

	        Returns:
	            List[Document]: List of loaded documents.

	        Raises:
	            RuntimeError: If anything fails while validating inputs or loading
	                (including missing paths); the original exception is chained
	                as ``__cause__``.
	        """
	        try:
	            input_files = None
	            input_dir = None
	            if isinstance(file_paths, (list, tuple)):
	                input_files = self._check_input(file_paths, is_file=True)
	            else:
	                path = self._check_input(file_paths, is_file=False)
	                if path.is_dir():
	                    input_dir = path
	                else:
	                    input_files = [path]

	            reader = SimpleDirectoryReader(
	                input_dir=input_dir,
	                input_files=input_files,
	                exclude=self._resolve_exclude(exclude_files),
	                exclude_hidden=self.exclude_hidden,
	                recursive=self.recursive,
	                required_exts=self._normalize_suffixes(filter_file_by_suffix),
	                num_files_limit=self.num_files_limits,
	                file_metadata=self.custom_metadata_function,
	                file_extractor=self.extern_file_extractor,
	                encoding=self.encoding,
	                errors=self.errors,
	            )

	            if use_async:
	                # NOTE(review): asyncio.run() cannot be used from inside an
	                # already-running event loop; callers in async code should
	                # keep use_async=False.
	                llama_docs = asyncio.run(
	                    reader.aload_data(show_progress=show_progress, num_workers=self.num_workers)
	                )
	            else:
	                # NOTE(review): num_workers is not forwarded on the sync path,
	                # so it only takes effect with use_async=True - confirm intent.
	                llama_docs = reader.load_data(show_progress=show_progress)

	            if merge_by_file:
	                llama_docs = self._merge_docs_by_file(llama_docs)

	            documents = [Document.from_llama_document(doc) for doc in llama_docs]
	            logger.info(f"Loaded {len(documents)} documents")
	            return documents

	        except Exception as e:
	            logger.error(f"Failed to load documents: {str(e)}")
	            # Chain the cause so the original traceback is preserved.
	            raise RuntimeError(f"Failed to load documents: {str(e)}") from e