omnibook-base / src /loader.py
REXPro's picture
Update src/loader.py
1b4c1e2 verified
Raw
History Blame Contribute Delete
1.94 kB
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re
def clean_text(text: str) -> str:
    """Re-flow text extracted from a PDF while keeping paragraph breaks.

    A single newline is treated as a hard wrap in the middle of a sentence
    and replaced with a space. A run of two or more newlines is treated as a
    paragraph boundary and normalized to exactly one blank line.

    Args:
        text: Raw text of one page.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Normalize CRLF / CR line endings so the newline rules below apply.
    text = re.sub(r'\r\n?', '\n', text)
    # A line holding only spaces/tabs is a blank line, i.e. a paragraph break.
    text = re.sub(r'\n[^\S\n]+(?=\n)', '\n', text)
    # Join sentences that were cut by a single newline.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Collapse runs of horizontal whitespace only. Matching `\s` here would
    # also swallow the `\n\n` paragraph breaks that must be preserved.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    # Normalize every paragraph break (plus surrounding whitespace) to `\n\n`.
    text = re.sub(r'[^\S\n]*\n\s*', '\n\n', text)
    return text.strip()
def load_pdf(file_path: str):
    """Read a PDF and return its documents with cleaned text.

    Args:
        file_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The list produced by ``PyPDFLoader.load()`` (presumably one Document
        per page), with every ``page_content`` passed through ``clean_text``.
    """
    pages = PyPDFLoader(file_path).load()

    # Clean each page in place so the text is tidy before any splitting.
    for page in pages:
        page.page_content = clean_text(page.page_content)

    page_total = len(pages)
    print(f"✅ Loaded {page_total} pages from {file_path}")
    return pages
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks along natural text boundaries.

    Args:
        documents: Documents to split, e.g. the result of ``load_pdf``.
        chunk_size: Maximum size of a chunk, forwarded to the splitter.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter.

    Returns:
        The list of chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Tried in order: paragraph break, end of sentence (the look-behind
        # keeps the period with its sentence), single space, then any
        # character as the last resort.
        separators=["\n\n", r"(?<=\. )", " ", ""],
        # The sentence separator is a regular expression. By default the
        # splitter escapes separators and matches them literally, in which
        # case the look-behind pattern would never match anything.
        is_separator_regex=True,
    )
    chunks = splitter.split_documents(documents)
    print(f"✂️ Split into {len(chunks)} chunks")
    return chunks
# Manual smoke test: load a sample PDF, split it, and show the first chunk.
if __name__ == "__main__":
    # Adjust this path to an existing file before running the module directly.
    sample_pdf = "data/Md_Reja_E_Rabbi_Tonmoy.pdf"
    pieces = split_documents(load_pdf(sample_pdf))
    print("\n--- Contoh Chunk Pertama ---")
    first_piece = pieces[0]
    print(first_piece.page_content)