# ai-knowledge-assistant/app/services/document_processor.py
# Hamza4100's picture
# Upload 20 files
# a631409 verified
"""Document processing service for PDF, CSV, and Markdown files."""
import asyncio
import os
from typing import List, Optional

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
)

from app.config import settings
class DocumentProcessor:
    """Load supported files and split them into chunked documents.

    Supported file types are dispatched by extension through
    ``supported_extensions``; every loaded document is tagged with its
    source filename and file type, split with a shared
    ``RecursiveCharacterTextSplitter``, and each resulting chunk is given
    positional metadata.
    """

    def __init__(self):
        """Create the text splitter and the extension-to-loader table."""
        # Chunk size and overlap are read from application settings; the
        # separators are listed from coarsest (blank line) to finest (any
        # character).
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            length_function=len,
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        # Maps a lower-cased file extension to the bound loader method.
        self.supported_extensions = {
            ".pdf": self._load_pdf,
            ".csv": self._load_csv,
            ".md": self._load_markdown,
            ".txt": self._load_text,
        }

    def get_supported_extensions(self) -> List[str]:
        """Return the supported file extensions, each with a leading dot."""
        return list(self.supported_extensions)

    async def process_file(
        self, file_path: str, metadata: Optional[dict] = None
    ) -> List[Document]:
        """Process a file and return chunked documents with metadata.

        Args:
            file_path: Path of the file to load. The extension (compared
                case-insensitively) selects the loader.
            metadata: Optional extra metadata merged into every loaded
                document. It is applied after ``source`` and ``file_type``
                are set, so matching keys override those values.

        Returns:
            The chunks produced by the text splitter. Each chunk carries
            ``chunk_id``, ``chunk_index`` and ``total_chunks`` metadata in
            addition to the document-level metadata.

        Raises:
            ValueError: If the file extension is not supported.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self.supported_extensions:
            raise ValueError(
                f"Unsupported file type: {ext}. "
                f"Supported: {', '.join(self.supported_extensions)}"
            )

        # Loading and splitting are synchronous, blocking calls. Running
        # them directly inside this coroutine would stall the event loop
        # for the whole duration, so hand them to the default executor.
        # Exceptions raised by the loader propagate through the await.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._load_and_chunk, file_path, ext, metadata
        )

    def _load_and_chunk(
        self, file_path: str, ext: str, metadata: Optional[dict]
    ) -> List[Document]:
        """Synchronously load ``file_path``, tag metadata and split it."""
        # Load documents
        documents = self.supported_extensions[ext](file_path)

        # Add custom metadata
        filename = os.path.basename(file_path)
        for doc in documents:
            doc.metadata["source"] = filename
            doc.metadata["file_type"] = ext
            if metadata:
                doc.metadata.update(metadata)

        # Split into chunks
        chunks = self.text_splitter.split_documents(documents)

        # Add chunk IDs; the total is invariant, so compute it once.
        total_chunks = len(chunks)
        for index, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = f"{filename}_chunk_{index}"
            chunk.metadata["chunk_index"] = index
            chunk.metadata["total_chunks"] = total_chunks
        return chunks

    def _load_pdf(self, file_path: str) -> List[Document]:
        """Load a PDF file with ``PyPDFLoader``."""
        return PyPDFLoader(file_path).load()

    def _load_csv(self, file_path: str) -> List[Document]:
        """Load a CSV file with ``CSVLoader``, read as UTF-8."""
        return CSVLoader(file_path, encoding="utf-8").load()

    def _load_markdown(self, file_path: str) -> List[Document]:
        """Load a Markdown file; it is read exactly like plain text."""
        return self._load_text(file_path)

    def _load_text(self, file_path: str) -> List[Document]:
        """Load a plain text file with ``TextLoader``, read as UTF-8."""
        return TextLoader(file_path, encoding="utf-8").load()
# Module-level DocumentProcessor instance. It is constructed when this module
# is imported, so the splitter settings are read at import time.
document_processor = DocumentProcessor()