File size: 12,371 Bytes
c0f31c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
"""

ParagraphChunker.py



A module for paragraph-level document chunking with token counting and preprocessing.



Features:

- Paragraph-based document splitting

- Content validation

- Multi-level delimiter detection

- Smart paragraph boundary detection

"""

import logging
import spacy
from typing import List, Optional
from pathlib import Path
from datetime import datetime
from langchain_core.documents import Document
from core.BaseChunker import BaseChunker

logger = logging.getLogger(__name__)

class ParagraphChunker(BaseChunker):
    """Chunk documents at the paragraph level with token counting.

    Splits pages (or whole text files) into paragraphs, analyzes each one
    (characters, tokens, sentences, words), filters out fragments that are
    too short or too token-poor, and wraps the survivors in langchain
    ``Document`` objects with per-paragraph metadata.
    """

    # Minimum characters for a valid paragraph; shorter fragments are dropped.
    PARAGRAPH_MIN_LENGTH = 50

    def __init__(self, model_name=None, embedding_model=None):
        """Initialize paragraph chunker with specified models.

        Args:
            model_name: Name of the model for tokenization.
            embedding_model: Model for generating embeddings.
        """
        super().__init__(model_name, embedding_model)
        self.page_stats = []  # human-readable notes about skipped paragraphs

        # spaCy is used for sentence segmentation when PDF extraction has
        # flattened paragraph structure.  If the model package is missing,
        # download it and retry once.
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError as e:  # spaCy raises OSError for a missing model
            logger.error(f"Error loading spaCy model: {e}")
            import subprocess
            import sys
            logger.info("Installing spaCy model...")
            # Fix: use sys.executable rather than the bare "python" command so
            # the model is installed into the same interpreter/venv that is
            # running this code ("python" on PATH may be a different one).
            result = subprocess.run(
                [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
                capture_output=True,
            )
            if result.returncode != 0:
                # Surface the download failure explicitly instead of letting
                # the load below raise a confusing "model not found" error.
                logger.error(
                    "spaCy model download failed: %s",
                    result.stderr.decode(errors="replace"),
                )
            self.nlp = spacy.load("en_core_web_sm")

    def _split_into_paragraphs(self, text: str) -> List[str]:
        """Split text into paragraphs using length and punctuation heuristics.

        Tries double line breaks first; if that yields almost nothing (a
        common PDF-extraction artifact), reconstructs paragraphs from spaCy
        sentence boundaries using a length/punctuation heuristic.

        Args:
            text: The text content to split.

        Returns:
            List of whitespace-normalized paragraphs, each at least
            PARAGRAPH_MIN_LENGTH characters long.
        """
        # Normalize carriage returns so '\n\n' detection works uniformly.
        text = text.replace('\r', '\n')

        # First, try double line breaks.
        paragraphs = text.split('\n\n')

        # If the structure appears flattened, rebuild from sentences.
        if len(paragraphs) <= 3:
            # Fix: use the module logger instead of print(), consistent with
            # the rest of this file.
            logger.info("PDF extraction flattened structure. Reconstructing from sentences...")

            doc = self.nlp(text)
            paragraphs = []
            current_para = []
            current_length = 0

            for sent in doc.sents:
                sent_text = sent.text.strip()
                if not sent_text:
                    continue

                # Accumulate the sentence into the current paragraph.
                current_para.append(sent_text)
                current_length += len(sent_text)

                # End the paragraph once it is long enough (300-600 chars is
                # typical), closes on real sentence-final punctuation, and
                # contains at least two sentences.
                should_end_paragraph = (
                    current_length > 300 and
                    sent_text.endswith(('.', '!', '?')) and
                    len(current_para) >= 2
                )

                if should_end_paragraph:
                    paragraphs.append(' '.join(current_para))
                    current_para = []
                    current_length = 0

            # Flush any trailing sentences into a final paragraph.
            if current_para:
                paragraphs.append(' '.join(current_para))

            logger.info(f"Reconstructed {len(paragraphs)} paragraphs using length heuristics")

        # Collapse internal whitespace and drop fragments that are too short.
        cleaned_paragraphs = []
        for para in paragraphs:
            clean_para = ' '.join(para.split())
            if len(clean_para) >= self.PARAGRAPH_MIN_LENGTH:
                cleaned_paragraphs.append(clean_para)

        logger.info(f"Final paragraph count: {len(cleaned_paragraphs)}")
        return cleaned_paragraphs

    def _process_single_paragraph(self, content: str, page_number: int,
                                 para_number: int, preprocess: bool) -> Optional[Document]:
        """Process a single paragraph with analysis and metadata.

        Args:
            content: The paragraph content.
            page_number: The page number.
            para_number: The paragraph number.
            preprocess: Whether to preprocess the text.

        Returns:
            Document with processed content and metadata, or None if the
            paragraph is too short or below the token threshold.
        """
        # Reject fragments before doing any (possibly expensive) analysis.
        if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
            self.page_stats.append(f"Paragraph {para_number} on page {page_number} is too short.")
            return None

        # Optionally preprocess the text (preprocess_text from BaseChunker).
        if preprocess:
            content = self.preprocess_text(content)

        # analyze_text (BaseChunker) supplies the counts used below.
        stats = self.analyze_text(content)

        # TOKEN_THRESHOLD is inherited from BaseChunker; paragraphs below it
        # are recorded in page_stats and dropped.
        if stats["token_count"] < self.TOKEN_THRESHOLD:
            self.page_stats.append(
                f"Paragraph {para_number} on page {page_number} dropped: "
                f"only {stats['token_count']} tokens"
            )
            return None

        metadata = {
            "page": page_number,
            "paragraph": para_number,
            "char_count": stats["char_count"],
            "token_count": stats["token_count"],
            "sentence_count": stats["sentence_count"],
            "word_count": stats["word_count"],
            "has_ocr": str(stats.get("has_content", True)),
        }

        return Document(page_content=content, metadata=metadata)

    def paragraph_process_document(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """Process a PDF document paragraph by paragraph with analysis.

        Args:
            file_path: Path to the PDF file.
            preprocess: Whether to preprocess paragraph text.

        Returns:
            List of Document objects, one per valid paragraph.

        Raises:
            Exception: Any error during loading/processing is logged and
                re-raised unchanged.
        """
        try:
            self.page_stats = []  # reset stats for this document
            raw_pages = self.load_document(file_path)
            processed_paragraphs = []

            logger.info(f"Processing document with {len(raw_pages)} pages")

            for page_idx, page in enumerate(raw_pages):
                paragraphs = self._split_into_paragraphs(page.page_content)
                logger.info(f"Page {page_idx+1}: Found {len(paragraphs)} paragraphs")

                for para_idx, paragraph in enumerate(paragraphs):
                    processed_para = self._process_single_paragraph(
                        paragraph,
                        page_idx + 1,
                        para_idx + 1,
                        preprocess
                    )
                    # Fix: explicit None check — the helper's contract is
                    # None-or-Document, so don't rely on Document truthiness.
                    if processed_para is not None:
                        processed_paragraphs.append(processed_para)

            # Output skipped paragraphs for transparency.
            if self.page_stats:
                logger.info("\n".join(self.page_stats))

            logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs")
            return processed_paragraphs

        except Exception as e:
            logger.error(f"Error in paragraph_process_document: {e}")
            raise

    def process_document(self, file_path: str, preprocess: bool = True) -> List[Document]:
        """Process document using the paragraph chunking strategy.

        Implements the BaseChunker abstract method by delegating to
        paragraph_process_document.

        Args:
            file_path: Path to the PDF file.
            preprocess: Whether to preprocess paragraph text.

        Returns:
            List of Document objects, one per valid paragraph.
        """
        return self.paragraph_process_document(file_path, preprocess)

    def process_text_file(self, file_path: str, preprocess: bool = False) -> List[Document]:
        """Process a text file directly, preserving paragraph structure.

        Args:
            file_path: Path to the text file.
            preprocess: Whether to preprocess paragraph text.

        Returns:
            List of Document objects, one per valid paragraph.

        Raises:
            Exception: Any error during processing is logged and re-raised.
        """
        try:
            # Load and clean with the same logic used for PDF conversion.
            content = self.load_text_file(file_path)
            content = self.clean_text_for_processing(content)

            # Plain text files keep their double-line-break structure, so no
            # sentence-based reconstruction is needed here.
            paragraphs = content.split('\n\n')

            logger.info(f"Found {len(paragraphs)} paragraphs in text file: {file_path}")

            processed_paragraphs = []
            file_name = Path(file_path).name

            for para_idx, paragraph in enumerate(paragraphs):
                paragraph = paragraph.strip()
                if paragraph:
                    processed_para = self._process_single_paragraph_from_text(
                        paragraph,
                        file_path,
                        file_name,
                        para_idx + 1,
                        preprocess
                    )
                    # Explicit None check, matching paragraph_process_document.
                    if processed_para is not None:
                        processed_paragraphs.append(processed_para)

            logger.info(f"Processed {len(processed_paragraphs)} valid paragraphs from text file")
            return processed_paragraphs

        except Exception as e:
            logger.error(f"Error processing text file: {e}")
            raise

    def _process_single_paragraph_from_text(self, content: str, file_path: str,
                                        file_name: str, para_number: int,
                                        preprocess: bool) -> Optional[Document]:
        """Process a single paragraph from a text file with analysis and metadata.

        Args:
            content: The paragraph content.
            file_path: Full path to the source file.
            file_name: Name of the source file.
            para_number: The paragraph number.
            preprocess: Whether to preprocess the text.

        Returns:
            Document with processed content and metadata, or None if the
            paragraph is too short or fails content validation.
        """
        # Reject fragments before analysis, mirroring the PDF path.
        if len(content.strip()) < self.PARAGRAPH_MIN_LENGTH:
            logger.debug(f"Paragraph {para_number} too short ({len(content)} chars), skipping")
            return None

        # Preprocess if requested (headers/footers are irrelevant for plain
        # text files, hence remove_headers_footers=False).
        if preprocess:
            content = self.preprocess_text(content, remove_headers_footers=False)

        # analyze_text (BaseChunker) supplies the counts recorded below.
        analysis = self.analyze_text(content)

        # Validate content quality (is_content_valid from BaseChunker).
        if not self.is_content_valid(content):
            logger.debug(f"Paragraph {para_number} failed content validation, skipping")
            return None

        # NOTE(review): unlike the PDF path, no TOKEN_THRESHOLD filter is
        # applied here — confirm whether that asymmetry is intentional.
        metadata = {
            "source": file_path,
            "file_name": file_name,
            "file_type": "txt",
            "paragraph": para_number,
            "char_count": analysis["char_count"],
            "token_count": analysis["token_count"],
            "sentence_count": analysis["sentence_count"],
            "word_count": analysis["word_count"],
            "chunk_type": "paragraph",
            "processing_timestamp": datetime.now().isoformat(),
        }

        doc = Document(page_content=content, metadata=metadata)
        logger.debug(f"Created paragraph {para_number}: {analysis['char_count']} chars, {analysis['token_count']} tokens")

        return doc