Spaces:
Sleeping
Sleeping
"""Document loader for PDF files."""

import logging
from pathlib import Path
from typing import List

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter

from src.config.config import settings

logger = logging.getLogger(__name__)
| class DocumentLoader: | |
| """Loader for PDF documents.""" | |
| def __init__( | |
| self, | |
| chunk_size: int = 1000, | |
| chunk_overlap: int = 200, | |
| ) -> None: | |
| """Initialize document loader. | |
| Args: | |
| chunk_size: Size of text chunks. | |
| chunk_overlap: Overlap between chunks. | |
| """ | |
| self.chunk_size = chunk_size | |
| self.chunk_overlap = chunk_overlap | |
| self.text_splitter = RecursiveCharacterTextSplitter( | |
| chunk_size=chunk_size, | |
| chunk_overlap=chunk_overlap, | |
| length_function=len, | |
| ) | |
| def load_pdf(self, pdf_path: Path) -> List[Document]: | |
| """Load a PDF file and split it into chunks. | |
| Args: | |
| pdf_path: Path to the PDF file. | |
| Returns: | |
| List of document chunks. | |
| """ | |
| if not pdf_path.exists(): | |
| raise FileNotFoundError(f"PDF file not found: {pdf_path}") | |
| try: | |
| logger.info(f"Loading PDF: {pdf_path}") | |
| loader = PyPDFLoader(str(pdf_path)) | |
| documents = loader.load() | |
| # Split documents into chunks | |
| chunks = self.text_splitter.split_documents(documents) | |
| logger.info(f"Loaded {len(chunks)} chunks from {pdf_path}") | |
| return chunks | |
| except Exception as e: | |
| logger.error(f"Error loading PDF {pdf_path}: {str(e)}") | |
| raise | |
| def load_pdfs_from_directory(self, directory: Path) -> List[Document]: | |
| """Load all PDF files from a directory. | |
| Args: | |
| directory: Directory containing PDF files. | |
| Returns: | |
| List of document chunks from all PDFs. | |
| """ | |
| if not directory.exists(): | |
| raise FileNotFoundError(f"Directory not found: {directory}") | |
| pdf_files = list(directory.glob("*.pdf")) | |
| if not pdf_files: | |
| logger.warning(f"No PDF files found in {directory}") | |
| return [] | |
| all_chunks: List[Document] = [] | |
| for pdf_path in pdf_files: | |
| try: | |
| chunks = self.load_pdf(pdf_path) | |
| all_chunks.extend(chunks) | |
| except Exception as e: | |
| logger.error(f"Failed to load {pdf_path}: {str(e)}") | |
| continue | |
| logger.info(f"Loaded {len(all_chunks)} total chunks from {len(pdf_files)} PDFs") | |
| return all_chunks | |
| def load_csv_insights(self, csv_path: Path, sample_size: int = 1050000) -> List[Document]: | |
| """Load insights from CSV file and convert to documents. | |
| Args: | |
| csv_path: Path to CSV file. | |
| sample_size: Number of rows to sample from CSV. | |
| Returns: | |
| List of documents generated from CSV insights. | |
| """ | |
| try: | |
| from src.rag.csv_document_generator import CSVDocumentGenerator | |
| logger.info(f"Loading CSV insights from {csv_path}") | |
| generator = CSVDocumentGenerator(csv_path, sample_size=sample_size) | |
| documents = generator.generate_all_documents() | |
| logger.info(f"Generated {len(documents)} documents from CSV insights") | |
| return documents | |
| except Exception as e: | |
| logger.error(f"Error loading CSV insights: {str(e)}") | |
| raise | |