File size: 5,021 Bytes
3998131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
"""
Storage module for saving and loading processed chunks
"""

import json
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional

from .models import DocumentChunk, ProcessingStats

logger = logging.getLogger(__name__)


class ChunkStorage:
    """Handles saving and loading of document chunks.

    Chunks are persisted as a single JSON document with a ``metadata``
    header (chunk count, format version, optional processing stats) and a
    ``chunks`` array. A human-readable ``chunks_summary.txt`` is written
    alongside the JSON file on every save.
    """

    def __init__(self, output_file: Path):
        """
        Initialize storage.

        Args:
            output_file: Path to the output JSON file. Parent directories
                are created eagerly so later writes cannot fail on a
                missing directory.
        """
        self.output_file = output_file
        self.output_file.parent.mkdir(parents=True, exist_ok=True)

    def save_chunks(
        self,
        chunks: List[DocumentChunk],
        stats: Optional[ProcessingStats] = None,
    ) -> None:
        """
        Save chunks to the JSON file and write a summary alongside it.

        Args:
            chunks: List of DocumentChunk objects to persist.
            stats: Optional processing statistics; when provided they are
                embedded under ``metadata.processing_stats``.
        """
        logger.info(f"Saving {len(chunks)} chunks to {self.output_file}")

        # Serialize each chunk via its own to_dict() so the chunk type
        # controls its JSON shape.
        chunks_data = [chunk.to_dict() for chunk in chunks]

        output = {
            'metadata': {
                'total_chunks': len(chunks),
                'version': '1.0',
            },
            'chunks': chunks_data
        }

        # Explicit None check: a stats object that happens to be falsy
        # (e.g. zero documents processed) must still be recorded.
        if stats is not None:
            output['metadata']['processing_stats'] = stats.to_dict()

        # ensure_ascii=False keeps non-ASCII text readable in the file;
        # indent=2 makes the output diff-friendly.
        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(output, f, indent=2, ensure_ascii=False)

        logger.info(f"Successfully saved chunks to {self.output_file}")

        # Also save a human-readable companion summary.
        self._save_summary(chunks, stats)

    def _save_summary(
        self,
        chunks: List[DocumentChunk],
        stats: Optional[ProcessingStats] = None,
    ) -> None:
        """Save a human-readable summary (stats plus the first 5 chunks).

        The summary is written to ``chunks_summary.txt`` in the same
        directory as the output JSON file.
        """
        summary_file = self.output_file.parent / "chunks_summary.txt"

        with open(summary_file, 'w', encoding='utf-8') as f:
            f.write("=" * 80 + "\n")
            f.write("DOCUMENT CHUNKS SUMMARY\n")
            f.write("=" * 80 + "\n\n")

            # Explicit None check — see save_chunks: falsy stats objects
            # must still be reported.
            if stats is not None:
                f.write(f"Total Documents Processed: {stats.total_documents}\n")
                f.write(f"Total Chunks Created: {stats.total_chunks}\n")
                f.write(f"Total Words: {stats.total_words}\n")
                f.write(f"Average Chunk Size: {stats.avg_chunk_size:.1f} words\n")
                f.write(f"Processing Time: {stats.processing_time_seconds:.2f} seconds\n")
                f.write("\nDocuments:\n")
                for doc in stats.documents_processed:
                    f.write(f"  - {doc}\n")
                f.write("\n")

            f.write("-" * 80 + "\n")
            f.write("SAMPLE CHUNKS (First 5)\n")
            f.write("-" * 80 + "\n\n")

            # Only the first five chunks are previewed to keep the summary
            # short; text previews are truncated to 200 characters.
            for i, chunk in enumerate(chunks[:5], 1):
                f.write(f"Chunk {i}: {chunk.chunk_id}\n")
                f.write(f"Source: {chunk.metadata.source_file}\n")
                f.write(f"Section: {chunk.metadata.article_section or 'N/A'}\n")
                f.write(f"Words: {chunk.metadata.word_count}\n")
                f.write(f"Preview: {chunk.text[:200]}...\n")
                f.write("\n" + "-" * 80 + "\n\n")

        logger.info(f"Summary saved to {summary_file}")

    def load_chunks(self) -> List[DocumentChunk]:
        """
        Load chunks from the JSON file.

        Returns:
            List of DocumentChunk objects reconstructed via
            ``DocumentChunk.from_dict``.

        Raises:
            FileNotFoundError: If the chunks file does not exist.
        """
        logger.info(f"Loading chunks from {self.output_file}")

        if not self.output_file.exists():
            raise FileNotFoundError(f"Chunks file not found: {self.output_file}")

        with open(self.output_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        chunks = [DocumentChunk.from_dict(chunk_data) for chunk_data in data['chunks']]

        logger.info(f"Loaded {len(chunks)} chunks")

        return chunks

    def validate_chunks(self, chunks: List[DocumentChunk]) -> bool:
        """
        Validate chunks before saving.

        Args:
            chunks: List of chunks to validate.

        Returns:
            True if valid; raises otherwise.

        Raises:
            ValueError: If the list is empty, or any chunk has empty or
                whitespace-only text, a missing ID, or a zero word count.
        """
        if not chunks:
            raise ValueError("No chunks to save")

        for i, chunk in enumerate(chunks):
            # Reject whitespace-only text as well as empty/None text.
            if not chunk.text or not chunk.text.strip():
                raise ValueError(f"Chunk {i} has empty text")

            if not chunk.chunk_id:
                raise ValueError(f"Chunk {i} has no ID")

            if chunk.metadata.word_count == 0:
                raise ValueError(f"Chunk {i} has zero word count")

        logger.info(f"Validated {len(chunks)} chunks successfully")
        return True