omnibook-base

Sleeping

App Files Files Community

omnibook-base / src /loader.py

REXPro

Update src/loader.py

1b4c1e2 verified 8 days ago

Raw

History Blame Contribute Delete

1.94 kB

	from langchain_community.document_loaders import PyPDFLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	import re

	def clean_text(text: str) -> str:
	    """
	    Clean text extracted from a PDF while keeping paragraph boundaries.

	    A single newline (a line wrap that cuts a sentence) is replaced by a
	    space so the sentence is joined again. A run of two or more newlines
	    is treated as a paragraph boundary and normalized to exactly one
	    blank line (two newline characters).

	    Args:
	        text: Raw text of one page.

	    Returns:
	        The cleaned text, stripped of leading and trailing whitespace.
	    """
	    # Normalize Windows line endings first; otherwise the carriage return
	    # hides blank lines from the newline look-arounds below.
	    text = text.replace("\r\n", "\n")
	    # Replace a lone newline with a space (re-joins a broken sentence).
	    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
	    # Collapse repeated horizontal whitespace only. Matching \s here would
	    # also swallow the paragraph breaks and flatten the page to one line.
	    text = re.sub(r'[^\S\n]{2,}', ' ', text)
	    # Every newline still present belongs to a run of two or more, so each
	    # run (plus any whitespace around it) is one paragraph break.
	    text = re.sub(r'[^\S\n]*\n\s*', '\n\n', text)
	    return text.strip()

	def load_pdf(file_path: str):
	    """
	    Read a PDF through PyPDFLoader and return its documents with cleaned text.

	    The ``page_content`` of every loaded document is rewritten in place
	    with the result of ``clean_text``, then a one-line summary is printed.

	    Args:
	        file_path: Path of the PDF file handed to PyPDFLoader.

	    Returns:
	        The documents returned by the loader, with cleaned ``page_content``.
	    """
	    pages = PyPDFLoader(file_path).load()

	    # Clean the text of every page before it gets split into chunks.
	    for page in pages:
	        raw_content = page.page_content
	        page.page_content = clean_text(raw_content)

	    page_count = len(pages)
	    print(f"✅ Loaded {page_count} pages from {file_path}")
	    return pages

	def split_documents(documents, chunk_size=1000, chunk_overlap=200):
	    """
	    Split documents into chunks with RecursiveCharacterTextSplitter.

	    The separators are tried in order: paragraph break first, then the
	    position right after a sentence-ending ". ", then a single space, and
	    finally the empty string (character-level split).

	    Args:
	        documents: Documents to split (e.g. the result of ``load_pdf``).
	        chunk_size: Value forwarded to the splitter as ``chunk_size``.
	        chunk_overlap: Value forwarded to the splitter as ``chunk_overlap``.

	    Returns:
	        The chunks returned by ``splitter.split_documents``.
	    """
	    splitter = RecursiveCharacterTextSplitter(
	        chunk_size=chunk_size,
	        chunk_overlap=chunk_overlap,
	        # Tells the splitter: try to cut at paragraphs first, then at
	        # sentence ends if the piece is still too long, then per word.
	        # The sentence separator is a regex look-behind, written as a raw
	        # string so the backslash is not an invalid escape sequence.
	        separators=["\n\n", r"(?<=\. )", " ", ""],
	        # Without this flag the separators are matched as literal text, so
	        # the look-behind pattern would never match anything.
	        is_separator_regex=True,
	    )
	    chunks = splitter.split_documents(documents)
	    print(f"✂️ Split into {len(chunks)} chunks")
	    return chunks

	# Manual smoke test: load one PDF, split it, and show the first chunk.
	if __name__ == "__main__":
	    # Make sure this path is correct before running the module directly.
	    docs = load_pdf("data/Md_Reja_E_Rabbi_Tonmoy.pdf")
	    chunks = split_documents(docs)

	    print("\n--- Contoh Chunk Pertama ---")
	    # The chunk list can be empty (for example when the PDF has no
	    # extractable text), so guard the index access.
	    if chunks:
	        print(chunks[0].page_content)
	    else:
	        print("(no chunks produced - the PDF may contain no extractable text)")