Spaces:
Sleeping
Sleeping
"""Document processing service for PDF, CSV, Markdown, and plain-text files."""
| import os | |
| from typing import List, Optional | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import ( | |
| PyPDFLoader, | |
| CSVLoader, | |
| TextLoader, | |
| ) | |
| from langchain.schema import Document | |
| from app.config import settings | |
class DocumentProcessor:
    """Load supported document files and split them into tagged chunks.

    Each supported extension maps to a loader method. Loaded documents are
    split with a ``RecursiveCharacterTextSplitter`` configured from
    ``settings.CHUNK_SIZE`` and ``settings.CHUNK_OVERLAP``.
    """

    def __init__(self):
        """Build the text splitter and the extension-to-loader table."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            length_function=len,
            # Coarsest to finest: paragraph break, line break, sentence end,
            # space, then any character.
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        # Keys are lowercase extensions including the leading dot; lookups
        # in process_file lowercase the file's extension to match.
        self.supported_extensions = {
            ".pdf": self._load_pdf,
            ".csv": self._load_csv,
            ".md": self._load_markdown,
            ".txt": self._load_text,
        }

    def get_supported_extensions(self) -> List[str]:
        """Return the supported extensions (lowercase, with leading dot)."""
        return list(self.supported_extensions)

    async def process_file(
        self, file_path: str, metadata: Optional[dict] = None
    ) -> List[Document]:
        """Process a file and return chunked documents with metadata.

        Loading and splitting are blocking calls, so they run in the event
        loop's default executor instead of on the loop itself; other
        coroutines keep running while a file is being processed.

        Args:
            file_path: Path of the file to process. Its extension, compared
                case-insensitively, selects the loader.
            metadata: Optional extra metadata merged into every loaded
                document. It is applied after ``"source"`` and
                ``"file_type"`` are set, so its keys win on conflict.

        Returns:
            The chunks produced by the text splitter, each tagged with
            ``"chunk_id"``, ``"chunk_index"`` and ``"total_chunks"``.

        Raises:
            ValueError: If the file's extension is not supported.
        """
        # Local import: asyncio is not among this module's top-level imports.
        import asyncio

        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self.supported_extensions:
            raise ValueError(
                f"Unsupported file type: {ext}. "
                f"Supported: {', '.join(self.supported_extensions)}"
            )

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._load_and_chunk, file_path, ext, metadata
        )

    def _load_and_chunk(
        self, file_path: str, ext: str, metadata: Optional[dict]
    ) -> List[Document]:
        """Synchronously load *file_path*, tag its documents, and split them.

        ``ext`` must already be a key of ``self.supported_extensions``.
        """
        documents = self.supported_extensions[ext](file_path)

        # Record only the file name (not the full path) as the source.
        filename = os.path.basename(file_path)
        for doc in documents:
            doc.metadata["source"] = filename
            doc.metadata["file_type"] = ext
            if metadata:
                doc.metadata.update(metadata)

        chunks = self.text_splitter.split_documents(documents)

        # Chunk IDs are positional within this file: "<filename>_chunk_<i>".
        total_chunks = len(chunks)
        for i, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = f"{filename}_chunk_{i}"
            chunk.metadata["chunk_index"] = i
            chunk.metadata["total_chunks"] = total_chunks
        return chunks

    def _load_pdf(self, file_path: str) -> List[Document]:
        """Load a PDF file with ``PyPDFLoader``."""
        return PyPDFLoader(file_path).load()

    def _load_csv(self, file_path: str) -> List[Document]:
        """Load a UTF-8 encoded CSV file with ``CSVLoader``."""
        return CSVLoader(file_path, encoding="utf-8").load()

    def _load_markdown(self, file_path: str) -> List[Document]:
        """Load a Markdown file as UTF-8 text (no Markdown-specific parsing)."""
        return self._load_text(file_path)

    def _load_text(self, file_path: str) -> List[Document]:
        """Load a UTF-8 encoded plain text file with ``TextLoader``."""
        return TextLoader(file_path, encoding="utf-8").load()
# Module-level instance, constructed once when this module is imported.
# Presumably other modules import and share it rather than building their
# own DocumentProcessor — verify against callers.
document_processor = DocumentProcessor()