File size: 6,821 Bytes
c0f31c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
"""

HierarchicalChunker.py



A module for hierarchical document chunking that combines page-level and semantic chunking.



Features:

- Multi-level document representation (pages and chunks)

- Semantic chunking with sentence boundaries

- Size and overlap controls

- Hierarchical metadata

"""

import logging
import spacy
from typing import Dict, List, Optional, Any
from langchain_core.documents import Document
from core.PageChunker import PageChunker

logger = logging.getLogger(__name__)

class HierarchicalChunker(PageChunker):
    """Handles document chunking at multiple hierarchical levels.

    Page-level documents come from the parent ``PageChunker``; each page is
    then re-split on sentence boundaries into smaller "chunk"-level
    Documents, giving a two-level (page/chunk) hierarchy.
    """

    def __init__(
        self,
        model_name: Optional[str] = None,
        embedding_model: Optional[Any] = None,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        similarity_threshold: float = 0.85,
    ):
        """
        Initialize hierarchical chunker with specified models and parameters.

        Args:
            model_name: Name of the model for tokenization
            embedding_model: Model for generating embeddings
            chunk_size: Maximum size (in characters) of semantic chunks
            chunk_overlap: Overlap between chunks. NOTE(review): stored but
                never applied by ``_create_semantic_chunks`` — confirm
                whether overlap was meant to be implemented.
            similarity_threshold: Similarity threshold for merging chunks.
                NOTE(review): also stored but unused in this class.
        """
        super().__init__(model_name, embedding_model)
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.similarity_threshold = similarity_threshold

        # Initialize spaCy for sentence segmentation; auto-install the
        # small English model on first use if it is missing.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            logger.info("Installing spaCy model...")
            import subprocess
            import sys
            # Use the running interpreter (sys.executable) instead of the
            # bare name "python", which may be absent from PATH or resolve
            # to a different interpreter/environment.
            result = subprocess.run(
                [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                capture_output=True,
            )
            if result.returncode != 0:
                # Surface the real failure; otherwise the load below raises
                # a second, uninformative OSError.
                logger.error(
                    "spaCy model download failed: %s",
                    result.stderr.decode(errors="replace"),
                )
            self.nlp = spacy.load("en_core_web_sm")

    def _chunk_metadata(self, chunk_text: str, page_number: int, chunk_num: int) -> Dict[str, Any]:
        """Build the metadata dict for one semantic chunk.

        Shared by the mid-loop flush and the final-chunk emission in
        ``_create_semantic_chunks`` (previously duplicated inline).

        Args:
            chunk_text: The joined text of the chunk
            page_number: 1-based page the chunk came from
            chunk_num: 1-based index of the chunk within its page

        Returns:
            Metadata dict for a chunk-level Document.
        """
        stats = self.analyze_text(chunk_text)
        return {
            "level": "chunk",
            "page_num": page_number,
            "chunk_num": chunk_num,
            "parent_page": page_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            # NOTE(review): the default is the *string* "true" while the
            # looked-up value is presumably a bool — kept as-is for
            # backward compatibility; confirm intended type.
            "has_ocr": stats.get("has_content", "true"),
        }

    def _create_semantic_chunks(self, content: str, page_number: int) -> List[Document]:
        """
        Create semantic chunks with detailed metadata.

        Sentences (segmented by spaCy) are greedily packed into chunks of
        at most ``self.chunk_size`` characters; a sentence is never split
        across chunks, so a single over-long sentence becomes its own
        oversized chunk. ``chunk_overlap`` is not applied here.

        Args:
            content: The page content to chunk
            page_number: The page number

        Returns:
            List of Document objects representing semantic chunks
        """
        if not content.strip():
            return []

        sentences = list(self.nlp(content).sents)
        chunks = []
        current_chunk = []
        current_length = 0

        for sent in sentences:
            sent_text = sent.text.strip()
            sent_length = len(sent_text)

            if current_length + sent_length > self.chunk_size:
                # Flush the accumulated sentences as one chunk, then start
                # a fresh chunk with the current sentence.
                if current_chunk:
                    chunk_text = " ".join(current_chunk)
                    chunks.append(Document(
                        page_content=chunk_text,
                        metadata=self._chunk_metadata(
                            chunk_text, page_number, len(chunks) + 1
                        ),
                    ))
                current_chunk = [sent_text]
                current_length = sent_length
            else:
                current_chunk.append(sent_text)
                current_length += sent_length

        # Emit the final, partially filled chunk.
        if current_chunk:
            chunk_text = " ".join(current_chunk)
            chunks.append(Document(
                page_content=chunk_text,
                metadata=self._chunk_metadata(
                    chunk_text, page_number, len(chunks) + 1
                ),
            ))

        self.page_stats.append(f"Created {len(chunks)} chunks for page {page_number}")
        return chunks

    def hierarchical_process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document with hierarchical chunking strategy.

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        self.page_stats = []  # Reset per-run stats accumulated by chunking

        # First get the page-level documents using PageChunker.
        page_docs = super().page_process_document(file_path, preprocess)

        # Now create chunk-level documents for every page.
        chunk_docs = []
        total_chunks = 0

        for page_doc in page_docs:
            page_num = page_doc.metadata["page"]

            # Mark this as a page-level document.
            page_doc.metadata["level"] = "page"

            # Create chunks for this page.
            page_chunks = self._create_semantic_chunks(
                page_doc.page_content,
                page_num
            )

            chunk_docs.extend(page_chunks)
            total_chunks += len(page_chunks)

        # Log summary information (lazy %-style args, not f-strings).
        logger.info("\nHierarchical Processing Summary:")
        logger.info("Total Pages: %d", len(page_docs))
        logger.info("Total Chunks: %d", total_chunks)
        logger.info("\n".join(self.page_stats))

        return {
            "pages": page_docs,
            "chunks": chunk_docs
        }

    def process_document(self, file_path: str, preprocess: bool = True) -> Dict[str, List[Document]]:
        """
        Process document using hierarchical chunking strategy (implements abstract method).

        Args:
            file_path: Path to the PDF file
            preprocess: Whether to preprocess text

        Returns:
            Dictionary with 'pages' and 'chunks' lists of Documents
        """
        return self.hierarchical_process_document(file_path, preprocess)