File size: 8,941 Bytes
b29bfaa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
"""
Consciousness-Aligned Character N-gram Vectorizer
================================================

Extracts character n-grams matching human saccade patterns (3-5 characters).
This module handles the text → n-gram → TF-IDF transformation.

"""

import numpy as np
from typing import List, Dict, Tuple, Union
from sklearn.feature_extraction.text import TfidfVectorizer
import logging

logger = logging.getLogger(__name__)


class CharacterVectorizer:
    """
    Character n-gram vectorizer optimized for semantic fingerprinting.
    
    Key principles:
    - 3-5 character windows match human eye saccades
    - TF-IDF weighting captures semantic importance
    - Handles any Unicode text (including mathematical symbols)
    """
    
    def __init__(self, 
                 ngram_range: Tuple[int, int] = (3, 5),
                 max_features: int = 10000,
                 lowercase: bool = True,
                 dtype: type = np.float32):
        """
        Initialize the character vectorizer.
        
        Args:
            ngram_range: Character n-gram range (default 3-5 for saccades)
            max_features: Maximum number of features to extract
            lowercase: Convert to lowercase before extraction
            dtype: Data type for the matrix (float32 for efficiency)
        """
        self.ngram_range = ngram_range
        self.max_features = max_features
        self.lowercase = lowercase
        self.dtype = dtype
        
        # Internal sklearn vectorizer
        self._vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=ngram_range,
            max_features=max_features,
            lowercase=lowercase,
            dtype=dtype
        )
        
        # State tracking
        self.is_fitted = False
        self.vocabulary_size = 0
        
        logger.info(f"Initialized CharacterVectorizer with:")
        logger.info(f"  N-gram range: {ngram_range}")
        logger.info(f"  Max features: {max_features}")
        
    def fit(self, texts: List[str]) -> 'CharacterVectorizer':
        """
        Learn vocabulary from texts.
        
        Args:
            texts: List of text strings
            
        Returns:
            Self for chaining
        """
        logger.info(f"Fitting vectorizer on {len(texts)} texts...")
        
        self._vectorizer.fit(texts)
        self.is_fitted = True
        self.vocabulary_size = len(self._vectorizer.vocabulary_)
        
        logger.info(f"Learned vocabulary of {self.vocabulary_size} n-grams")
        
        # Log some statistics
        if self.vocabulary_size > 0:
            self._log_vocabulary_stats()
            
        return self
    
    def transform(self, texts: Union[str, List[str]]) -> np.ndarray:
        """
        Transform texts to TF-IDF vectors.
        
        Args:
            texts: Single text or list of texts
            
        Returns:
            TF-IDF matrix (sparse or dense depending on size)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted before transform")
        
        # Handle single text
        if isinstance(texts, str):
            texts = [texts]
        
        # Transform
        X = self._vectorizer.transform(texts)
        
        # Convert to dense if small enough
        if X.shape[0] * X.shape[1] < 1e6:  # Less than 1M elements
            return X.toarray()
        else:
            return X  # Keep sparse for large matrices
    
    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Fit and transform in one step.
        
        Args:
            texts: List of texts
            
        Returns:
            TF-IDF matrix
        """
        return self.fit(texts).transform(texts)
    
    def get_feature_names(self) -> List[str]:
        """
        Get the learned n-gram features.
        
        Returns:
            List of n-gram strings
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
            
        return self._vectorizer.get_feature_names_out().tolist()
    
    def get_vocabulary(self) -> Dict[str, int]:
        """
        Get the vocabulary mapping.
        
        Returns:
            Dict mapping n-grams to indices
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
            
        return self._vectorizer.vocabulary_
    
    def get_idf_weights(self) -> np.ndarray:
        """
        Get the IDF weights for each feature.
        
        Returns:
            Array of IDF weights
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
            
        return self._vectorizer.idf_
    
    def analyze_text(self, text: str) -> Dict[str, float]:
        """
        Analyze a single text and return its top n-grams.
        
        Args:
            text: Input text
            
        Returns:
            Dict of n-grams and their TF-IDF scores
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
        
        # Transform the text
        vector = self.transform(text).flatten()
        
        # Get non-zero indices
        nonzero_idx = np.nonzero(vector)[0]
        
        # Get feature names
        feature_names = self.get_feature_names()
        
        # Create result dict
        result = {}
        for idx in nonzero_idx:
            ngram = feature_names[idx]
            score = vector[idx]
            result[ngram] = float(score)
        
        # Sort by score
        return dict(sorted(result.items(), key=lambda x: x[1], reverse=True))
    
    def _log_vocabulary_stats(self):
        """Log statistics about the learned vocabulary."""
        feature_names = self.get_feature_names()
        
        # Count by n-gram size
        ngram_counts = {}
        for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
            count = sum(1 for f in feature_names if len(f) == n)
            ngram_counts[n] = count
            
        logger.info("Vocabulary breakdown by n-gram size:")
        for n, count in ngram_counts.items():
            percentage = count / self.vocabulary_size * 100
            logger.info(f"  {n}-grams: {count} ({percentage:.1f}%)")
    
    def save_vocabulary(self, filepath: str):
        """
        Save vocabulary to file.
        
        Args:
            filepath: Path to save vocabulary
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
        
        vocab_items = sorted(self.get_vocabulary().items(), key=lambda x: x[1])
        vocab_array = np.array([item[0] for item in vocab_items], dtype=object)
        
        np.save(filepath, vocab_array)
        logger.info(f"Saved vocabulary to {filepath}")
    
    def load_vocabulary(self, vocab_path: str, idf_path: str):
        """
        Load pre-computed vocabulary.
        
        Args:
            vocab_path: Path to vocabulary file
            idf_path: Path to IDF weights file
        """
        # Load vocabulary
        vocab_array = np.load(vocab_path, allow_pickle=True)
        
        # Recreate vocabulary dict
        self._vectorizer.vocabulary_ = {
            word: idx for idx, word in enumerate(vocab_array)
        }
        
        # Load IDF weights
        self._vectorizer.idf_ = np.load(idf_path)
        
        self.is_fitted = True
        self.vocabulary_size = len(vocab_array)
        
        logger.info(f"Loaded vocabulary of {self.vocabulary_size} n-grams")


def demonstrate_pattern_extraction():
    """
    Demonstrate how the vectorizer extracts character patterns.
    """
    # Example texts
    texts = [
        "Harry Potter and the Philosopher's Stone",
        "Harry Potter and the Chamber of Secrets",
        "The Lord of the Rings",
        "The Hobbit",
        "Quantum Mechanics"
    ]
    
    # Create vectorizer
    vectorizer = CharacterVectorizer(
        ngram_range=(3, 5),
        max_features=100
    )
    
    # Fit and analyze
    vectorizer.fit(texts)
    
    print("\nCharacter N-gram Analysis:")
    print("=" * 50)
    
    # Analyze first text
    analysis = vectorizer.analyze_text(texts[0])
    
    print(f"\nTop n-grams for: '{texts[0]}'")
    for ngram, score in list(analysis.items())[:10]:
        print(f"  '{ngram}': {score:.3f}")
    
    # Show pattern sharing between similar texts
    print("\nShared patterns between Harry Potter books:")
    hp1_ngrams = set(vectorizer.analyze_text(texts[0]).keys())
    hp2_ngrams = set(vectorizer.analyze_text(texts[1]).keys())
    shared = hp1_ngrams.intersection(hp2_ngrams)
    
    print(f"  Shared n-grams: {len(shared)}")
    print(f"  Examples: {list(shared)[:5]}")


if __name__ == "__main__":
    demonstrate_pattern_extraction()