Spaces:
Sleeping
Sleeping
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_core.documents import Document | |
class ChunkingManager:
    """Turns loaded documents into vector-ready chunks.

    Free-text pages (PDF/Word/plain text) are split with a recursive
    character splitter; rows tagged ``category == "structured_data"``
    (CSV rows already formatted as single sentences) are passed through
    unsplit.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Build the underlying text splitter.

        Args:
            chunk_size: Maximum characters per chunk. The default of 1000
                is large enough to capture surrounding context.
            chunk_overlap: Characters shared between neighbouring chunks,
                so a sentence cut at a chunk border still appears whole in
                at least one chunk (default 200).
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            # Prefer paragraph breaks, then line breaks, then word
            # boundaries, and only as a last resort split mid-word.
            separators=["\n\n", "\n", " ", ""],
        )

    def chunk_documents(self, documents: list[Document]) -> list[Document]:
        """
        Splits large documents into smaller vector-ready chunks.
        INTELLIGENCE: Skips CSV rows (structured_data) as they are already perfect.
        """
        print(f"✂️ Starting Chunking Process on {len(documents)} documents...")
        chunked_docs: list[Document] = []
        skipped_docs: list[Document] = []  # CSVs and short image descriptions
        for doc in documents:
            category = doc.metadata.get("category", "")
            # CONDITION 1: Structured Data (CSV) -> DO NOT SPLIT
            # We already formatted these as single sentences.
            if category == "structured_data":
                skipped_docs.append(doc)
                continue
            # CONDITION 2: Text/PDF/Word -> SPLIT
            # These are pages that might be 3000 tokens long.
            splits = self.text_splitter.split_documents([doc])
            chunked_docs.extend(splits)
        # Merge results: preserved structured rows first, then text chunks.
        total_docs = skipped_docs + chunked_docs
        print(f" - CSV Rows preserved: {len(skipped_docs)}")
        print(f" - Text Pages split into: {len(chunked_docs)} chunks")
        print(f"✅ Total Vector-Ready Chunks: {len(total_docs)}")
        return total_docs