Spaces:
Sleeping
Sleeping
File size: 3,062 Bytes
"""Document processing service for PDF, CSV, Markdown, and plain-text files."""
import asyncio
import os
from typing import List, Optional

from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    PyPDFLoader,
    CSVLoader,
    TextLoader,
)

from app.config import settings
class DocumentProcessor:
    """Load supported files and split them into metadata-tagged chunks.

    The loader for a file is selected by its lower-cased extension from
    ``supported_extensions``. Loaded documents are split with a
    ``RecursiveCharacterTextSplitter`` sized from ``settings.CHUNK_SIZE``
    and ``settings.CHUNK_OVERLAP``.
    """

    def __init__(self):
        """Build the text splitter and the extension-to-loader table."""
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=settings.CHUNK_SIZE,
            chunk_overlap=settings.CHUNK_OVERLAP,
            length_function=len,
            # Listed from the coarsest boundary (blank line) down to the
            # finest (empty string).
            separators=["\n\n", "\n", ". ", " ", ""],
        )
        # Maps a lower-cased extension (with its leading dot) to the bound
        # method that loads files of that type.
        self.supported_extensions = {
            ".pdf": self._load_pdf,
            ".csv": self._load_csv,
            ".md": self._load_markdown,
            ".txt": self._load_text,
        }

    def get_supported_extensions(self) -> List[str]:
        """Return the supported extensions, each with its leading dot."""
        return list(self.supported_extensions)

    async def process_file(
        self, file_path: str, metadata: Optional[dict] = None
    ) -> List[Document]:
        """Process a file and return chunked documents with metadata.

        Args:
            file_path: Path of the file to load. Its extension, compared
                case-insensitively, selects the loader.
            metadata: Optional extra key/value pairs merged into the metadata
                of every loaded document. They are applied after ``source``
                and ``file_type`` and therefore override those keys when
                the names collide.

        Returns:
            The chunks produced by the text splitter, each tagged with
            ``chunk_id``, ``chunk_index`` and ``total_chunks``.

        Raises:
            ValueError: If the file extension is not supported.
        """
        ext = os.path.splitext(file_path)[1].lower()
        if ext not in self.supported_extensions:
            raise ValueError(
                f"Unsupported file type: {ext}. "
                f"Supported: {', '.join(self.supported_extensions)}"
            )

        # The loaders and the splitter are synchronous. Running them directly
        # in this coroutine would stall the event loop for the whole load, so
        # the pipeline is handed to the loop's default executor instead.
        # NOTE(review): concurrent calls now use self.text_splitter from
        # several worker threads; this assumes split_documents keeps no
        # per-call state on the splitter -- confirm.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(
            None, self._load_and_chunk, file_path, ext, metadata
        )

    def _load_and_chunk(
        self, file_path: str, ext: str, metadata: Optional[dict]
    ) -> List[Document]:
        """Synchronously load ``file_path``, tag its documents and split them."""
        documents = self.supported_extensions[ext](file_path)

        # Tag every loaded document; caller-supplied metadata is merged last.
        filename = os.path.basename(file_path)
        for doc in documents:
            doc.metadata["source"] = filename
            doc.metadata["file_type"] = ext
            if metadata:
                doc.metadata.update(metadata)

        chunks = self.text_splitter.split_documents(documents)

        # Number the chunks. The ID is built from the base filename only, so
        # it is unique within one file but not across same-named files.
        total_chunks = len(chunks)
        for index, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = f"{filename}_chunk_{index}"
            chunk.metadata["chunk_index"] = index
            chunk.metadata["total_chunks"] = total_chunks
        return chunks

    def _load_pdf(self, file_path: str) -> List[Document]:
        """Load a PDF file with ``PyPDFLoader``."""
        return PyPDFLoader(file_path).load()

    def _load_csv(self, file_path: str) -> List[Document]:
        """Load a UTF-8 encoded CSV file with ``CSVLoader``."""
        return CSVLoader(file_path, encoding="utf-8").load()

    def _load_markdown(self, file_path: str) -> List[Document]:
        """Load a Markdown file as UTF-8 text."""
        # Markdown is read as raw text, exactly like a .txt file.
        return self._load_text(file_path)

    def _load_text(self, file_path: str) -> List[Document]:
        """Load a UTF-8 encoded plain text file with ``TextLoader``."""
        return TextLoader(file_path, encoding="utf-8").load()
# Shared module-level instance, constructed once when this module is imported.
document_processor = DocumentProcessor()
|