File size: 1,910 Bytes
633bb91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import hashlib
from typing import List
from config import Config
from utils.normalizer import Normalizer
from langchain_core.documents import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentChunker:
    """Split documents into overlapping chunks and drop exact duplicates.

    Deduplication is content-based: each chunk's normalized text is hashed,
    and any hash already seen during this instance's lifetime is skipped.
    """

    def __init__(self):
        # Chunk sizing comes from the project-wide Config object.
        self.splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
        )
        # Fingerprints of every chunk emitted so far, for dedup across calls.
        self.existing_hashes = set()
        self.normalizer = Normalizer()

    def hash_text(self, text: str) -> str:
        """Return the hex MD5 digest of *text* (content fingerprint, not security)."""
        digest = hashlib.md5(text.encode('utf-8'))
        return digest.hexdigest()

    def split_documents(self, docs: List[Document]) -> List[dict]:
        """Split and deduplicate documents. Returns list of dicts with id, text, metadata."""
        pieces = self.splitter.split_documents(docs)
        unique_chunks = []

        for position, piece in enumerate(pieces):
            cleaned = self.normalizer.normalize_text(piece.page_content)
            # Skip chunks that normalize to nothing.
            if not cleaned:
                continue
            fingerprint = self.hash_text(cleaned)
            # Skip exact duplicates of anything already emitted.
            if fingerprint in self.existing_hashes:
                continue
            self.existing_hashes.add(fingerprint)

            unique_chunks.append({
                # Compact numeric id derived from the content hash.
                "id": int(fingerprint, 16) % (10 ** 9),
                "text": cleaned,
                "metadata": {
                    **piece.metadata,
                    "chunk_order": position,  # Preserve order
                },
            })

        return unique_chunks


if __name__ == "__main__":

    # Minimal smoke demo: split one short document and print its chunks.
    demo_docs = [
        Document(
            page_content="This is a long document that needs to be split into smaller pieces.",
            metadata={"source": "example.txt"},
        )
    ]

    chunker = DocumentChunker()
    produced = chunker.split_documents(demo_docs)

    for idx, piece in enumerate(produced):
        print(f"#### Chunk {idx}: {piece['text']}")