Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| import re | |
def clean_text(text: str) -> str:
    """Re-flow text extracted from a PDF while keeping paragraph boundaries.

    Line breaks that merely wrap a sentence are replaced by a single space,
    and every run of whitespace inside a paragraph is collapsed to one space.
    Paragraphs (blocks separated by at least one blank or whitespace-only
    line) stay separated by exactly one blank line in the result.

    Args:
        text: Raw text, possibly containing hard-wrapped lines.

    Returns:
        The cleaned text with no leading or trailing whitespace. Empty or
        whitespace-only input yields an empty string.
    """
    # Split on blank lines first. Collapsing whitespace over the whole string
    # (e.g. with r"\s{2,}") would also swallow the blank lines that mark
    # paragraph boundaries.
    paragraphs = re.split(r"\n\s*\n", text)
    # str.split() with no argument drops leading/trailing whitespace and
    # treats any whitespace run (spaces, tabs, single newlines, "\r") as one
    # delimiter, so re-joining with " " both un-wraps and de-duplicates.
    cleaned = (" ".join(paragraph.split()) for paragraph in paragraphs)
    # Skip paragraphs that ended up empty so no stray blank lines are emitted.
    return "\n\n".join(paragraph for paragraph in cleaned if paragraph)
def load_pdf(file_path: str):
    """Read a PDF with PyPDFLoader and clean the text of every loaded item.

    Args:
        file_path: Location of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The collection produced by ``PyPDFLoader(file_path).load()``, with
        each item's ``page_content`` replaced in place by its cleaned text.
    """
    pages = PyPDFLoader(file_path).load()

    # Clean the text up front so later splitting works on re-flowed text.
    for page in pages:
        raw_text = page.page_content
        page.page_content = clean_text(raw_text)

    page_count = len(pages)
    print(f"✅ Loaded {page_count} pages from {file_path}")
    return pages
def split_documents(documents, chunk_size=1000, chunk_overlap=200):
    """Split documents into overlapping chunks at natural text boundaries.

    Uses ``RecursiveCharacterTextSplitter`` with an explicit separator list
    so that paragraph breaks are tried first, then sentence ends, then
    spaces, and finally arbitrary character positions.

    Args:
        documents: Documents to split; passed unchanged to
            ``splitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The chunks returned by ``splitter.split_documents(documents)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        # Separator priority: paragraph break, then the position right after
        # a sentence-ending ". ", then a space, then anywhere.
        separators=["\n\n", r"(?<=\. )", " ", ""],
        # The sentence separator is a regex lookbehind. Without this flag the
        # splitter escapes every separator and searches for it as literal
        # text, so the pattern would never match and sentence-level splitting
        # would silently be skipped.
        is_separator_regex=True,
    )
    chunks = splitter.split_documents(documents)
    print(f"✂️ Split into {len(chunks)} chunks")
    return chunks
# Manual smoke test: load a sample PDF, split it, and show the first chunk.
if __name__ == "__main__":
    # Make sure this path exists when running the module directly.
    docs = load_pdf("data/Md_Reja_E_Rabbi_Tonmoy.pdf")
    chunks = split_documents(docs)
    print("\n--- Contoh Chunk Pertama ---")
    if chunks:
        print(chunks[0].page_content)
    else:
        # The splitter may return no chunks (presumably when no text could be
        # extracted); report that instead of failing with an IndexError.
        print("(no chunks produced)")