# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" were scrape artifacts
# from a Hugging Face Spaces page capture, not Python — replaced with this
# comment so the module parses.
| from pathlib import Path | |
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from src.config.settings import settings | |
| from src.utils.logger import get_logger | |
# Module-wide logger keyed to this module's import path (src.* convention).
logger = get_logger(__name__)
class DocumentLoader:
    """Load PDFs from the raw-data directory and split them into chunks.

    Pages that look like front/back matter (indexes, tables of contents,
    glossaries) are dropped before splitting, and chunks that are too short
    or dominated by dotted TOC leaders are dropped after splitting.
    """

    def load_pdfs(self) -> list:
        """Load every ``*.pdf`` under ``settings.raw_data_dir``.

        Returns:
            list: page-level documents with noise pages filtered out, or an
            empty list when the directory contains no PDFs.
        """
        data_path = Path(settings.raw_data_dir)
        # sorted() -> deterministic processing order across runs/platforms.
        pdf_files = sorted(data_path.glob("*.pdf"))
        if not pdf_files:
            # Report the actual configured directory, not a hard-coded path.
            logger.warning("No PDFs found in %s", data_path)
            return []

        documents = []
        for pdf in pdf_files:
            logger.info("Loading PDF: %s", pdf.name)
            pages = PyMuPDFLoader(str(pdf)).load()
            clean_pages = [
                page for page in pages if not self._is_noise_page(page.page_content)
            ]
            logger.info("Kept %d useful pages.", len(clean_pages))
            documents.extend(clean_pages)

        logger.info("Total kept pages: %d", len(documents))
        return documents

    def split_documents(self, documents, chunk_size: int = 800,
                        chunk_overlap: int = 150) -> list:
        """Split page documents into overlapping chunks and drop junk chunks.

        Args:
            documents: page-level documents (e.g. from ``load_pdfs``).
            chunk_size: target characters per chunk (default 800).
            chunk_overlap: characters shared between adjacent chunks
                (default 150).

        Returns:
            list: chunks that passed the noise filter.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        chunks = text_splitter.split_documents(documents)
        filtered_chunks = [
            chunk for chunk in chunks
            if not self._is_noise_chunk(chunk.page_content)
        ]
        logger.info("Split into %d clean chunks.", len(filtered_chunks))
        return filtered_chunks

    @staticmethod
    def _is_noise_page(page_content: str) -> bool:
        """Heuristic: True when a page looks like an index, table of
        contents, or glossary rather than body text."""
        text = page_content.strip().lower()
        # Index pages announce themselves near the top of the page.
        if "index" in text[:200]:
            return True
        # TOC pages pair chapter titles with dotted leader lines.
        if "chapter" in text and "...." in text:
            return True
        # Glossary-style alphabetical lists: comma-dense but short.
        if text.count(",") > 20 and len(text) < 1500:
            return True
        return False

    @staticmethod
    def _is_noise_chunk(page_content: str) -> bool:
        """Heuristic: True when a chunk is too short to be useful or is
        dominated by TOC-style dotted leaders."""
        text = page_content.strip()
        if len(text) < 200:
            return True
        if text.count(".....") > 2:
            return True
        return False