Spaces:
Running
Running
| import os | |
| from pathlib import Path | |
| from typing import List, Optional | |
| from langchain_community.document_loaders import PyPDFLoader, ArxivLoader | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
| from project.logger.logging import get_logger | |
| logger = get_logger(__name__) | |
class DataPreparation:
    """Load PDF / ArXiv documents and split them into text chunks.

    The instance owns a data directory (created on construction if missing)
    and a ``RecursiveCharacterTextSplitter`` configured with the requested
    chunk size and overlap.
    """

    # ArXiv identifier and local cache file name of "Attention Is All You Need".
    _ATTENTION_ARXIV_ID = "1706.03762"
    _ATTENTION_PDF_NAME = "attention-is-all-you-need.pdf"

    def __init__(
        self,
        data_dir: str = "data",
        chunk_size: int = 1000,
        chunk_overlap: int = 200
    ):
        """Create the data directory and configure the text splitter.

        Args:
            data_dir: Directory searched for the locally cached paper.
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Overlap between consecutive chunks.
        """
        self.data_dir = Path(data_dir)
        # parents=True so a nested data_dir (e.g. "cache/data") does not fail
        # with FileNotFoundError when the intermediate directories are missing.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            separators=["\n\n", "\n", " ", ""]
        )
        logger.info(f"DataPreparation initialized with chunk_size={chunk_size}")

    @classmethod
    def _is_attention_paper(cls, arxiv_id: str) -> bool:
        """Return True if *arxiv_id* names the Attention paper (any ``vN`` version)."""
        base, sep, version = arxiv_id.rpartition("v")
        if sep and version.isdigit():
            # Strip a trailing version suffix such as "v5".
            arxiv_id = base
        return arxiv_id == cls._ATTENTION_ARXIV_ID

    def load_attention_paper(self, arxiv_id: str = _ATTENTION_ARXIV_ID) -> List[Document]:
        """Load the paper from the local cache, falling back to ArXiv.

        The local file ``<data_dir>/attention-is-all-you-need.pdf`` is used
        only when *arxiv_id* refers to the Attention paper; for any other
        identifier the cached file would be the wrong document, so the paper
        is always fetched through ``ArxivLoader``.

        Args:
            arxiv_id: ArXiv identifier passed to ``ArxivLoader`` as the query.

        Returns:
            The loaded documents.

        Raises:
            ValueError: If ArXiv returns no documents.
            Exception: Any loader error is logged and re-raised unchanged.
        """
        pdf_path = self.data_dir / self._ATTENTION_PDF_NAME
        if self._is_attention_paper(arxiv_id) and pdf_path.is_file():
            logger.info(f"Loading PDF from local file: {pdf_path}")
            return self._load_pdf(str(pdf_path))

        logger.info(f"PDF not found locally. Downloading from ArXiv: {arxiv_id}")
        try:
            documents = ArxivLoader(query=arxiv_id, load_max_docs=1).load()
            if not documents:
                raise ValueError("No documents returned from ArXiv")
        except Exception as e:
            logger.error(f"Failed to download from ArXiv: {str(e)}")
            raise
        logger.info("Successfully downloaded paper from ArXiv")
        return documents

    def _load_pdf(self, pdf_path: str) -> List[Document]:
        """Load *pdf_path* with ``PyPDFLoader``; log and re-raise any failure."""
        try:
            documents = PyPDFLoader(pdf_path).load()
        except Exception as e:
            logger.error(f"Failed to load PDF: {str(e)}")
            raise
        logger.info(f"Loaded {len(documents)} pages from PDF")
        return documents

    def load_custom_pdf(self, pdf_path: str) -> List[Document]:
        """Load a user-supplied PDF.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            The documents produced by ``PyPDFLoader``.

        Raises:
            FileNotFoundError: If *pdf_path* is missing or is not a regular
                file (for example a directory).
        """
        # is_file() rather than exists(): a directory "exists" but would only
        # fail later inside the loader with a less helpful error.
        if not Path(pdf_path).is_file():
            raise FileNotFoundError(f"PDF not found: {pdf_path}")
        return self._load_pdf(pdf_path)

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """Split *documents* into chunks with the configured text splitter.

        Any splitter error is logged and re-raised unchanged.
        """
        try:
            chunks = self.text_splitter.split_documents(documents)
        except Exception as e:
            logger.error(f"Failed to split documents: {str(e)}")
            raise
        logger.info(f"Split documents into {len(chunks)} chunks")
        return chunks

    def prepare_documents(
        self,
        pdf_path: Optional[str] = None,
        use_attention_paper: bool = True
    ) -> List[Document]:
        """Load a document source and return it split into chunks.

        Args:
            pdf_path: Custom PDF to load. A falsy value (``None`` or ``""``)
                is treated as "not provided".
            use_attention_paper: When no *pdf_path* is given, load the
                Attention paper instead.

        Returns:
            The chunked documents.

        Raises:
            ValueError: If neither source is selected.
            Exception: Any load/split error is logged and re-raised unchanged.
        """
        try:
            if pdf_path:
                documents = self.load_custom_pdf(pdf_path)
            elif use_attention_paper:
                documents = self.load_attention_paper()
            else:
                raise ValueError("Either provide pdf_path or set use_attention_paper=True")
            chunks = self.split_documents(documents)
        except Exception as e:
            logger.error(f"Document preparation failed: {str(e)}")
            raise
        logger.info(f"Document preparation complete: {len(chunks)} chunks ready")
        return chunks