"""Document processing module for loading and splitting documents"""

from pathlib import Path
from typing import List, Union

from langchain.schema import Document
from langchain_community.document_loaders import (
    PyPDFDirectoryLoader,
    PyPDFLoader,
    TextLoader,
    WebBaseLoader,
)
from langchain_text_splitters import RecursiveCharacterTextSplitter


class DocumentProcessor:
    """Handles document loading and processing"""

    def __init__(self, chunk_size: int = 500, chunk_overlap: int = 50):
        """
        Initialize document processor

        Args:
            chunk_size: Size of text chunks
            chunk_overlap: Overlap between chunks
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )

    def load_from_url(self, url: str) -> List[Document]:
        """Load document(s) from a URL"""
        loader = WebBaseLoader(url)
        return loader.load()

    def load_from_pdf_dir(self, directory: Union[str, Path]) -> List[Document]:
        """Load documents from all PDFs inside a directory"""
        loader = PyPDFDirectoryLoader(str(directory))
        return loader.load()

    def load_from_txt(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a TXT file"""
        loader = TextLoader(str(file_path), encoding="utf-8")
        return loader.load()

    def load_from_pdf(self, file_path: Union[str, Path]) -> List[Document]:
        """Load document(s) from a PDF file"""
        loader = PyPDFLoader(str(file_path))
        return loader.load()

    def load_documents(self, sources: List[str]) -> List[Document]:
        """
        Load documents from URLs, PDF files, PDF directories, or TXT files

        Args:
            sources: List of URLs, PDF file paths, PDF folder paths, or TXT file paths

        Returns:
            List of loaded documents
        """
        docs: List[Document] = []
        for src in sources:
            if src.startswith("http://") or src.startswith("https://"):
                docs.extend(self.load_from_url(src))
                continue

            path = Path(src)
            if path.is_dir():
                docs.extend(self.load_from_pdf_dir(path))
            elif path.suffix.lower() == ".pdf":
                docs.extend(self.load_from_pdf(path))
            elif path.suffix.lower() == ".txt":
                docs.extend(self.load_from_txt(path))
            else:
                raise ValueError(
                    f"Unsupported source type: {src}. "
                    "Use a URL, .pdf file, .txt file, or PDF directory."
                )
        return docs

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split documents into chunks

        Args:
            documents: List of documents to split

        Returns:
            List of split documents
        """
        return self.splitter.split_documents(documents)

    def process_urls(self, urls: List[str]) -> List[Document]:
        """
        Complete pipeline to load and split documents

        Args:
            urls: List of URLs to process

        Returns:
            List of processed document chunks
        """
        docs = self.load_documents(urls)
        return self.split_documents(docs)
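

# Example usage: a minimal sketch of the intended call pattern. The URL below is
# a hypothetical placeholder, not part of the module; any mix of URLs, .pdf files,
# .txt files, or PDF directories accepted by load_documents() works the same way.
if __name__ == "__main__":
    processor = DocumentProcessor(chunk_size=500, chunk_overlap=50)
    chunks = processor.process_urls(["https://example.com/article"])
    print(f"Produced {len(chunks)} chunks")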