# Pro-RAG-Level1 / src / chunking.py
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
class ChunkingManager:
    """Split loaded documents into vector-ready chunks.

    Recursive character splitting is applied to free-text documents
    (PDF/Word/plain-text pages), while rows already flattened from
    structured sources (CSV) are passed through untouched.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Configure the underlying text splitter.

        Args:
            chunk_size: Maximum characters per chunk. Default 1000 —
                large enough to capture context.
            chunk_overlap: Characters shared between adjacent chunks.
                Default 200 — ensures we don't cut a sentence in half
                at the border.
        """
        # Separators are tried in order: paragraph break, line break,
        # word boundary, then raw characters as a last resort.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )

    def chunk_documents(self, documents: list[Document]) -> list[Document]:
        """Split large documents into smaller vector-ready chunks.

        Documents whose ``metadata["category"]`` is ``"structured_data"``
        (CSV rows, already formatted as single sentences upstream) are
        preserved as-is; everything else is split.

        Args:
            documents: Loaded documents, each carrying a ``category``
                entry in its metadata (missing category -> treated as
                free text and split).

        Returns:
            Preserved documents followed by the split chunks.
        """
        print(f"✂️ Starting Chunking Process on {len(documents)} documents...")
        chunked_docs: list[Document] = []
        skipped_docs: list[Document] = []  # CSV rows and short image descriptions

        for doc in documents:
            # CONDITION 1: structured data (CSV) -> DO NOT SPLIT.
            # These were already formatted as single sentences.
            if doc.metadata.get("category", "") == "structured_data":
                skipped_docs.append(doc)
                continue
            # CONDITION 2: text/PDF/Word pages -> SPLIT.
            # A single page might be thousands of tokens long.
            chunked_docs.extend(self.text_splitter.split_documents([doc]))

        # Merge: preserved rows first, then the newly produced chunks.
        total_docs = skipped_docs + chunked_docs
        print(f" - CSV Rows preserved: {len(skipped_docs)}")
        print(f" - Text Pages split into: {len(chunked_docs)} chunks")
        print(f"✅ Total Vector-Ready Chunks: {len(total_docs)}")
        return total_docs