File size: 1,996 Bytes
cd6f412
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import logging
import json
from typing import List, Dict, Any
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

logger = logging.getLogger(__name__)

def chunk_text(text: str, source_metadata: Dict[str, Any]) -> List[Document]:
    """
    Chunks the given text and attaches rich metadata to each chunk.

    Args:
        text: The full text content to be chunked.
        source_metadata: A dictionary containing metadata about the source document
                         (e.g., id, url, name, local_path).

    Returns:
        A list of LangChain Document objects, each representing a chunk.
        An empty list is returned when ``text`` is empty or None.
    """
    if not text:
        # Lazy %-style args: formatting is skipped when WARNING is disabled.
        logger.warning(
            "Received empty text for source_id %s. No chunks created.",
            source_metadata.get("id"),
        )
        return []

    # Using RecursiveCharacterTextSplitter as it's robust for general text.
    # These parameters can be tuned based on embedding model's context window and performance.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    split_texts = text_splitter.split_text(text)
    total_chunks = len(split_texts)  # hoisted: constant across the loop

    documents = []
    # Loop variable renamed from `chunk_text` — the original shadowed this
    # function's own name. enumerate(..., start=1) gives 1-based numbering.
    for chunk_number, chunk in enumerate(split_texts, start=1):
        # This metadata is crucial for the Fairness Agent and for filtering.
        chunk_metadata = {
            "source_id": source_metadata.get("id"),
            "source_url": source_metadata.get("url"),
            "source_name": source_metadata.get("name"),
            "source_local_path": source_metadata.get("local_path"),
            "chunk_number": chunk_number,
            "total_chunks": total_chunks,
        }
        documents.append(Document(page_content=chunk, metadata=chunk_metadata))

    logger.info(
        "Created %d chunks for source_id %s", len(documents), source_metadata.get("id")
    )
    return documents