omnibook-base

Sleeping

File size: 1,943 Bytes

f33866d
 
1b4c1e2
 
 
 
 
 
 
 
 
 
 
 
f33866d
 
 
1b4c1e2
f33866d
 
 
1b4c1e2
 
 
 
 
 
f33866d
 
 
 
1b4c1e2
 
f33866d
 
 
 
1b4c1e2
 
 
f33866d
 
1b4c1e2
f33866d
 
 
 
1b4c1e2
 
f33866d
1b4c1e2

from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import re

def clean_text(text: str) -> str:
    r"""
    Clean text extracted from a PDF by removing line breaks that cut sentences.

    A single newline (\n) is treated as a wrapped line and replaced with a
    space. A run of two or more newlines is treated as a paragraph boundary
    and is kept, normalized to exactly one blank line (\n\n).

    Args:
        text: Raw text of a page.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    # Normalize Windows line endings so "\r\n\r\n" is recognized as a paragraph break.
    text = text.replace('\r\n', '\n')
    # Replace a lone newline with a space (re-joins a sentence broken across lines).
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)
    # Collapse runs of whitespace, but never newlines: matching "\s" here would
    # also swallow the "\n\n" paragraph breaks this function is meant to keep.
    text = re.sub(r'[^\S\n]{2,}', ' ', text)
    # Normalize each paragraph break to exactly "\n\n" and drop the stray
    # spaces left around it by the steps above.
    text = re.sub(r'[^\S\n]*\n{2,}[^\S\n]*', '\n\n', text)
    return text.strip()

def load_pdf(file_path: str):
    """
    Read a PDF with PyPDFLoader and return its pages as Document objects.

    The text of every loaded page is passed through ``clean_text`` (the page
    objects are updated in place), and a short summary line is printed.

    Args:
        file_path: Path of the PDF file to load.

    Returns:
        The documents produced by the loader, with cleaned ``page_content``.
    """
    documents = PyPDFLoader(file_path).load()

    # Clean each page up front so the later splitting step sees tidy text.
    for page in documents:
        cleaned = clean_text(page.page_content)
        page.page_content = cleaned

    print(f"✅ Loaded {len(documents)} pages from {file_path}")
    return documents

def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into overlapping chunks with RecursiveCharacterTextSplitter.

    The separators are tried in order: paragraph breaks first, then sentence
    ends (a period followed by a space), then single spaces, and finally
    individual characters.

    Args:
        documents: Documents to split (e.g. the result of ``load_pdf``).
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``splitter.split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Try paragraphs first ("\n\n"); if a piece is still too long, cut
        # after a sentence end, then between words (" "), then anywhere ("").
        separators=["\n\n", r"(?<=\. )", " ", ""],
        # The sentence separator is a regex lookbehind. Without this flag the
        # splitter escapes every separator and searches for the literal text
        # "(?<=\. )", which never occurs, so sentences would never be used as
        # split points.
        is_separator_regex=True,
    )
    chunks = splitter.split_documents(documents)
    print(f"✂️ Split into {len(chunks)} chunks")
    return chunks

# Manual smoke test for the functions above
if __name__ == "__main__":
    # Make sure this path points to an existing PDF before running the file directly.
    docs = load_pdf("data/Md_Reja_E_Rabbi_Tonmoy.pdf")
    chunks = split_documents(docs)

    print("\n--- Contoh Chunk Pertama ---")
    # The splitter may return no chunks (e.g. no text could be extracted);
    # indexing chunks[0] unconditionally would raise IndexError in that case.
    if chunks:
        print(chunks[0].page_content)
    else:
        print("(no chunks were produced)")