Spaces:

kamkol
/

AB_Testing_RAG

Sleeping

App Files Files Community

AB_Testing_RAG / aimakerspace /text_utils.py

kamkol

Fix PDFLoader to process pages individually for correct page numbering

6b569cb 11 months ago

raw

history blame contribute delete

5.03 kB

	import os
	import re
	from typing import List
	import PyPDF2


	class TextFileLoader:
	    """Load plain-text documents from a single file or a directory tree.

	    When ``path`` is a directory, every file whose name ends in ``.txt``
	    anywhere beneath it is read; otherwise ``path`` is read as one file.
	    Each file contributes one string to ``documents``.
	    """

	    def __init__(self, path: str, encoding: str = "utf-8"):
	        """Remember where to read from; nothing is read until ``load``.

	        Args:
	            path: File or directory to load.
	            encoding: Text encoding used to decode every file.
	        """
	        self.documents = []  # full text of each loaded file, one entry per file
	        self.path = path
	        self.encoding = encoding

	    def load(self) -> None:
	        """(Re)populate ``self.documents`` from ``self.path``.

	        Raises:
	            OSError: If a file cannot be opened.
	            UnicodeDecodeError: If a file is not valid in ``self.encoding``.
	        """
	        # Start from an empty list so that calling load()/load_documents()
	        # more than once does not append duplicate copies of every file.
	        # Rebinding (rather than .clear()) leaves lists already handed to
	        # callers untouched.
	        self.documents = []
	        if os.path.isdir(self.path):
	            self.load_directory()
	        else:
	            self.load_file()

	    def load_file(self) -> None:
	        """Append the full text of the file at ``self.path`` to ``documents``."""
	        with open(self.path, "r", encoding=self.encoding) as f:
	            self.documents.append(f.read())

	    def load_directory(self) -> None:
	        """Append the text of every ``.txt`` file under ``self.path``.

	        Directories and files are visited in sorted order so the resulting
	        document order does not depend on the filesystem's listing order.
	        """
	        for root, dirs, files in os.walk(self.path):
	            # Sorting in place steers os.walk's top-down descent order.
	            dirs.sort()
	            for name in sorted(files):
	                if not name.endswith(".txt"):
	                    continue
	                full_path = os.path.join(root, name)
	                with open(full_path, "r", encoding=self.encoding) as f:
	                    self.documents.append(f.read())

	    def load_documents(self) -> List[str]:
	        """Load from ``self.path`` and return the list of document texts."""
	        self.load()
	        return self.documents


	class CharacterTextSplitter:
	    """Split text into fixed-size character windows that overlap.

	    Consecutive chunks start ``chunk_size - chunk_overlap`` characters
	    apart and are at most ``chunk_size`` characters long, so neighbouring
	    chunks share ``chunk_overlap`` characters.
	    """

	    def __init__(
	        self,
	        chunk_size: int = 1000,
	        chunk_overlap: int = 200,
	    ):
	        """Validate and store the window parameters.

	        Args:
	            chunk_size: Maximum number of characters per chunk.
	            chunk_overlap: Number of characters shared by adjacent chunks.

	        Raises:
	            ValueError: If ``chunk_overlap`` is negative (the stride would
	                exceed the chunk size and silently skip text).
	            AssertionError: If ``chunk_size`` is not greater than
	                ``chunk_overlap``.
	        """
	        if chunk_overlap < 0:
	            raise ValueError("Chunk overlap must not be negative")
	        # Raised explicitly instead of via `assert` so the check still runs
	        # under `python -O`; the exception type and message are kept for
	        # backward compatibility with existing callers.
	        if chunk_size <= chunk_overlap:
	            raise AssertionError("Chunk size must be greater than chunk overlap")

	        self.chunk_size = chunk_size
	        self.chunk_overlap = chunk_overlap

	    def split(self, text: str) -> List[str]:
	        """Return the overlapping chunks of ``text`` (empty list for ``""``)."""
	        stride = self.chunk_size - self.chunk_overlap
	        return [
	            text[start : start + self.chunk_size]
	            for start in range(0, len(text), stride)
	        ]

	    def split_texts(self, texts: List[str]) -> List[str]:
	        """Split every text in ``texts`` and return all chunks in one flat list."""
	        return [chunk for text in texts for chunk in self.split(text)]


	class PDFLoader:
	    """Extract text from PDFs, one ``documents`` entry per non-empty page.

	    ``path`` may name a single PDF file or a directory, which is searched
	    recursively for files whose name ends in ``.pdf`` (case-insensitive).
	    Progress and diagnostics are reported with ``print``.
	    """

	    def __init__(self, path: str):
	        """Remember the path to load; nothing is read until ``load``."""
	        self.documents = []  # extracted text, one string per non-empty page
	        self.path = path
	        print(f"PDFLoader initialized with path: {self.path}")

	    def load(self) -> None:
	        """Populate ``self.documents`` from ``self.path``.

	        Raises:
	            OSError: If ``self.path`` cannot be stat'ed (for example it does
	                not exist); this comes from the permission diagnostic that
	                runs before loading starts.
	            ValueError: If loading fails. Open/read failures are reported
	                as "Cannot access file", anything else as "Error processing
	                file". In directory mode per-file errors are printed and
	                skipped by ``load_directory`` instead.
	        """
	        print(f"Loading PDF from path: {self.path}")
	        print(f"Path exists: {os.path.exists(self.path)}")
	        print(f"Is file: {os.path.isfile(self.path)}")
	        print(f"Is directory: {os.path.isdir(self.path)}")
	        print(f"File permissions: {oct(os.stat(self.path).st_mode)[-3:]}")

	        try:
	            # No separate "can we open it?" probe: open() on a directory
	            # raises, which made the directory branch unreachable, and
	            # load_file() opens the file itself so access errors still
	            # surface through the handlers below.
	            if os.path.isdir(self.path):
	                self.load_directory()
	            else:
	                self.load_file()
	        except IOError as e:
	            raise ValueError(f"Cannot access file at '{self.path}': {str(e)}") from e
	        except Exception as e:
	            raise ValueError(f"Error processing file at '{self.path}': {str(e)}") from e

	    def _iter_page_texts(self, file_path: str, label: str):
	        """Yield the text of each non-empty page of the PDF at ``file_path``.

	        Pages are yielded in document order. Pages with no extractable
	        text are skipped, and a warning naming ``label`` is printed.
	        """
	        with open(file_path, 'rb') as pdf_file:
	            pdf_reader = PyPDF2.PdfReader(pdf_file)
	            for page_number, page in enumerate(pdf_reader.pages, start=1):
	                page_text = page.extract_text() or ""  # extract_text may return None
	                if page_text.strip():
	                    yield page_text
	                else:
	                    print(f"Warning: Page {page_number} is empty in {label}")

	    def load_file(self) -> None:
	        """Replace ``documents`` with the pages of the PDF at ``self.path``."""
	        file_name = os.path.basename(self.path)
	        self.documents = list(self._iter_page_texts(self.path, file_name))
	        print(f"Loaded {len(self.documents)} pages from {file_name}")

	    def load_directory(self) -> None:
	        """Replace ``documents`` with the pages of every PDF under ``self.path``.

	        Files are visited in sorted order so page order is reproducible.
	        A file that fails to load is reported and skipped; pages extracted
	        from it before the failure are kept.
	        """
	        self.documents = []  # Clear existing documents
	        for root, dirs, files in os.walk(self.path):
	            # Sorting in place steers os.walk's top-down descent order.
	            dirs.sort()
	            for file in sorted(files):
	                if not file.lower().endswith('.pdf'):
	                    continue
	                file_path = os.path.join(root, file)
	                try:
	                    for page_text in self._iter_page_texts(file_path, file):
	                        self.documents.append(page_text)
	                except Exception as e:
	                    # Best effort: one unreadable PDF must not abort the batch.
	                    print(f"Error processing {file}: {str(e)}")

	    def load_documents(self) -> List[str]:
	        """Return the page texts, loading them first if none are cached."""
	        if not self.documents:  # Only load if not already loaded
	            self.load()
	        return self.documents


	if __name__ == "__main__":
	    # Demo: chunk King Lear with the default splitter settings, then show
	    # the chunk count followed by the first two and the last two chunks.
	    loader = TextFileLoader("data/KingLear.txt")
	    loader.load()
	    chunks = CharacterTextSplitter().split_texts(loader.documents)
	    print(len(chunks))
	    for position, chunk_index in enumerate((0, 1, -2, -1)):
	        if position:
	            # Divider between samples, but not before the first one.
	            print("--------")
	        print(chunks[chunk_index])