""" Consciousness-Aligned Character N-gram Vectorizer ================================================ Extracts character n-grams matching human saccade patterns (3-5 characters). This module handles the text → n-gram → TF-IDF transformation. """ import numpy as np from typing import List, Dict, Tuple, Union from sklearn.feature_extraction.text import TfidfVectorizer import logging logger = logging.getLogger(__name__) class CharacterVectorizer: """ Character n-gram vectorizer optimized for semantic fingerprinting. Key principles: - 3-5 character windows match human eye saccades - TF-IDF weighting captures semantic importance - Handles any Unicode text (including mathematical symbols) """ def __init__(self, ngram_range: Tuple[int, int] = (3, 5), max_features: int = 10000, lowercase: bool = True, dtype: type = np.float32): """ Initialize the character vectorizer. Args: ngram_range: Character n-gram range (default 3-5 for saccades) max_features: Maximum number of features to extract lowercase: Convert to lowercase before extraction dtype: Data type for the matrix (float32 for efficiency) """ self.ngram_range = ngram_range self.max_features = max_features self.lowercase = lowercase self.dtype = dtype # Internal sklearn vectorizer self._vectorizer = TfidfVectorizer( analyzer='char', ngram_range=ngram_range, max_features=max_features, lowercase=lowercase, dtype=dtype ) # State tracking self.is_fitted = False self.vocabulary_size = 0 logger.info(f"Initialized CharacterVectorizer with:") logger.info(f" N-gram range: {ngram_range}") logger.info(f" Max features: {max_features}") def fit(self, texts: List[str]) -> 'CharacterVectorizer': """ Learn vocabulary from texts. Args: texts: List of text strings Returns: Self for chaining """ logger.info(f"Fitting vectorizer on {len(texts)} texts...") self._vectorizer.fit(texts) self.is_fitted = True self.vocabulary_size = len(self._vectorizer.vocabulary_) logger.info(f"Learned vocabulary of {self.vocabulary_size} n-grams") # Log some statistics if self.vocabulary_size > 0: self._log_vocabulary_stats() return self def transform(self, texts: Union[str, List[str]]) -> np.ndarray: """ Transform texts to TF-IDF vectors. Args: texts: Single text or list of texts Returns: TF-IDF matrix (sparse or dense depending on size) """ if not self.is_fitted: raise ValueError("Vectorizer must be fitted before transform") # Handle single text if isinstance(texts, str): texts = [texts] # Transform X = self._vectorizer.transform(texts) # Convert to dense if small enough if X.shape[0] * X.shape[1] < 1e6: # Less than 1M elements return X.toarray() else: return X # Keep sparse for large matrices def fit_transform(self, texts: List[str]) -> np.ndarray: """ Fit and transform in one step. Args: texts: List of texts Returns: TF-IDF matrix """ return self.fit(texts).transform(texts) def get_feature_names(self) -> List[str]: """ Get the learned n-gram features. Returns: List of n-gram strings """ if not self.is_fitted: raise ValueError("Vectorizer must be fitted first") return self._vectorizer.get_feature_names_out().tolist() def get_vocabulary(self) -> Dict[str, int]: """ Get the vocabulary mapping. Returns: Dict mapping n-grams to indices """ if not self.is_fitted: raise ValueError("Vectorizer must be fitted first") return self._vectorizer.vocabulary_ def get_idf_weights(self) -> np.ndarray: """ Get the IDF weights for each feature. Returns: Array of IDF weights """ if not self.is_fitted: raise ValueError("Vectorizer must be fitted first") return self._vectorizer.idf_ def analyze_text(self, text: str) -> Dict[str, float]: """ Analyze a single text and return its top n-grams. Args: text: Input text Returns: Dict of n-grams and their TF-IDF scores """ if not self.is_fitted: raise ValueError("Vectorizer must be fitted first") # Transform the text vector = self.transform(text).flatten() # Get non-zero indices nonzero_idx = np.nonzero(vector)[0] # Get feature names feature_names = self.get_feature_names() # Create result dict result = {} for idx in nonzero_idx: ngram = feature_names[idx] score = vector[idx] result[ngram] = float(score) # Sort by score return dict(sorted(result.items(), key=lambda x: x[1], reverse=True)) def _log_vocabulary_stats(self): """Log statistics about the learned vocabulary.""" feature_names = self.get_feature_names() # Count by n-gram size ngram_counts = {} for n in range(self.ngram_range[0], self.ngram_range[1] + 1): count = sum(1 for f in feature_names if len(f) == n) ngram_counts[n] = count logger.info("Vocabulary breakdown by n-gram size:") for n, count in ngram_counts.items(): percentage = count / self.vocabulary_size * 100 logger.info(f" {n}-grams: {count} ({percentage:.1f}%)") def save_vocabulary(self, filepath: str): """ Save vocabulary to file. Args: filepath: Path to save vocabulary """ if not self.is_fitted: raise ValueError("Vectorizer must be fitted first") vocab_items = sorted(self.get_vocabulary().items(), key=lambda x: x[1]) vocab_array = np.array([item[0] for item in vocab_items], dtype=object) np.save(filepath, vocab_array) logger.info(f"Saved vocabulary to {filepath}") def load_vocabulary(self, vocab_path: str, idf_path: str): """ Load pre-computed vocabulary. Args: vocab_path: Path to vocabulary file idf_path: Path to IDF weights file """ # Load vocabulary vocab_array = np.load(vocab_path, allow_pickle=True) # Recreate vocabulary dict self._vectorizer.vocabulary_ = { word: idx for idx, word in enumerate(vocab_array) } # Load IDF weights self._vectorizer.idf_ = np.load(idf_path) self.is_fitted = True self.vocabulary_size = len(vocab_array) logger.info(f"Loaded vocabulary of {self.vocabulary_size} n-grams") def demonstrate_pattern_extraction(): """ Demonstrate how the vectorizer extracts character patterns. """ # Example texts texts = [ "Harry Potter and the Philosopher's Stone", "Harry Potter and the Chamber of Secrets", "The Lord of the Rings", "The Hobbit", "Quantum Mechanics" ] # Create vectorizer vectorizer = CharacterVectorizer( ngram_range=(3, 5), max_features=100 ) # Fit and analyze vectorizer.fit(texts) print("\nCharacter N-gram Analysis:") print("=" * 50) # Analyze first text analysis = vectorizer.analyze_text(texts[0]) print(f"\nTop n-grams for: '{texts[0]}'") for ngram, score in list(analysis.items())[:10]: print(f" '{ngram}': {score:.3f}") # Show pattern sharing between similar texts print("\nShared patterns between Harry Potter books:") hp1_ngrams = set(vectorizer.analyze_text(texts[0]).keys()) hp2_ngrams = set(vectorizer.analyze_text(texts[1]).keys()) shared = hp1_ngrams.intersection(hp2_ngrams) print(f" Shared n-grams: {len(shared)}") print(f" Examples: {list(shared)[:5]}") if __name__ == "__main__": demonstrate_pattern_extraction()