"""
Consciousness-Aligned Character N-gram Vectorizer
================================================
Extracts character n-grams matching human saccade patterns (3-5 characters).
This module handles the text → n-gram → TF-IDF transformation.
"""
import numpy as np
from typing import List, Dict, Tuple, Union
from sklearn.feature_extraction.text import TfidfVectorizer
import logging
logger = logging.getLogger(__name__)
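# A minimal usage sketch (illustrative only; the example strings below are made up):
#
#     vec = CharacterVectorizer(ngram_range=(3, 5), max_features=1000)
#     X = vec.fit_transform(["first document", "second document"])  # TF-IDF matrix
#     top = vec.analyze_text("first document")  # {n-gram: score}, highest first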

class CharacterVectorizer:
    """
    Character n-gram vectorizer optimized for semantic fingerprinting.

    Key principles:
    - 3-5 character windows match human eye saccades
    - TF-IDF weighting captures semantic importance
    - Handles any Unicode text (including mathematical symbols)
    """

    def __init__(self,
                 ngram_range: Tuple[int, int] = (3, 5),
                 max_features: int = 10000,
                 lowercase: bool = True,
                 dtype: type = np.float32):
        """
        Initialize the character vectorizer.

        Args:
            ngram_range: Character n-gram range (default 3-5 for saccades)
            max_features: Maximum number of features to extract
            lowercase: Convert to lowercase before extraction
            dtype: Data type for the matrix (float32 for efficiency)
        """
        self.ngram_range = ngram_range
        self.max_features = max_features
        self.lowercase = lowercase
        self.dtype = dtype

        # Internal sklearn vectorizer
        self._vectorizer = TfidfVectorizer(
            analyzer='char',
            ngram_range=ngram_range,
            max_features=max_features,
            lowercase=lowercase,
            dtype=dtype
        )

        # State tracking
        self.is_fitted = False
        self.vocabulary_size = 0

        logger.info("Initialized CharacterVectorizer with:")
        logger.info(f"  N-gram range: {ngram_range}")
        logger.info(f"  Max features: {max_features}")

    def fit(self, texts: List[str]) -> 'CharacterVectorizer':
        """
        Learn vocabulary from texts.

        Args:
            texts: List of text strings

        Returns:
            Self for chaining
        """
        logger.info(f"Fitting vectorizer on {len(texts)} texts...")
        self._vectorizer.fit(texts)
        self.is_fitted = True
        self.vocabulary_size = len(self._vectorizer.vocabulary_)
        logger.info(f"Learned vocabulary of {self.vocabulary_size} n-grams")

        # Log some statistics
        if self.vocabulary_size > 0:
            self._log_vocabulary_stats()
        return self

    def transform(self, texts: Union[str, List[str]]) -> np.ndarray:
        """
        Transform texts to TF-IDF vectors.

        Args:
            texts: Single text or list of texts

        Returns:
            TF-IDF matrix (sparse or dense depending on size)
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted before transform")

        # Handle single text
        if isinstance(texts, str):
            texts = [texts]

        # Transform
        X = self._vectorizer.transform(texts)

        # Convert to dense if small enough
        if X.shape[0] * X.shape[1] < 1e6:  # Less than 1M elements
            return X.toarray()
        else:
            return X  # Keep sparse for large matrices

    def fit_transform(self, texts: List[str]) -> np.ndarray:
        """
        Fit and transform in one step.

        Args:
            texts: List of texts

        Returns:
            TF-IDF matrix
        """
        return self.fit(texts).transform(texts)

    def get_feature_names(self) -> List[str]:
        """
        Get the learned n-gram features.

        Returns:
            List of n-gram strings
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
        return self._vectorizer.get_feature_names_out().tolist()

    def get_vocabulary(self) -> Dict[str, int]:
        """
        Get the vocabulary mapping.

        Returns:
            Dict mapping n-grams to indices
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
        return self._vectorizer.vocabulary_

    def get_idf_weights(self) -> np.ndarray:
        """
        Get the IDF weights for each feature.

        Returns:
            Array of IDF weights
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
        return self._vectorizer.idf_

    def analyze_text(self, text: str) -> Dict[str, float]:
        """
        Analyze a single text and return its top n-grams.

        Args:
            text: Input text

        Returns:
            Dict of n-grams and their TF-IDF scores
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")

        # Transform the text
        vector = self.transform(text).flatten()

        # Get non-zero indices
        nonzero_idx = np.nonzero(vector)[0]

        # Get feature names
        feature_names = self.get_feature_names()

        # Create result dict
        result = {}
        for idx in nonzero_idx:
            ngram = feature_names[idx]
            score = vector[idx]
            result[ngram] = float(score)

        # Sort by score
        return dict(sorted(result.items(), key=lambda x: x[1], reverse=True))

    def _log_vocabulary_stats(self):
        """Log statistics about the learned vocabulary."""
        feature_names = self.get_feature_names()

        # Count by n-gram size
        ngram_counts = {}
        for n in range(self.ngram_range[0], self.ngram_range[1] + 1):
            count = sum(1 for f in feature_names if len(f) == n)
            ngram_counts[n] = count

        logger.info("Vocabulary breakdown by n-gram size:")
        for n, count in ngram_counts.items():
            percentage = count / self.vocabulary_size * 100
            logger.info(f"  {n}-grams: {count} ({percentage:.1f}%)")

    def save_vocabulary(self, filepath: str):
        """
        Save the learned vocabulary to file.

        Note: only the n-gram list is written; IDF weights must be saved
        separately (e.g. with np.save and get_idf_weights()) so that
        load_vocabulary() can restore both.

        Args:
            filepath: Path to save vocabulary
        """
        if not self.is_fitted:
            raise ValueError("Vectorizer must be fitted first")
        vocab_items = sorted(self.get_vocabulary().items(), key=lambda x: x[1])
        vocab_array = np.array([item[0] for item in vocab_items], dtype=object)
        np.save(filepath, vocab_array)
        logger.info(f"Saved vocabulary to {filepath}")

    def load_vocabulary(self, vocab_path: str, idf_path: str):
        """
        Load a pre-computed vocabulary and its IDF weights.

        Args:
            vocab_path: Path to vocabulary file
            idf_path: Path to IDF weights file
        """
        # Load vocabulary
        vocab_array = np.load(vocab_path, allow_pickle=True)

        # Recreate vocabulary dict
        self._vectorizer.vocabulary_ = {
            word: idx for idx, word in enumerate(vocab_array)
        }

        # Load IDF weights (relies on scikit-learn's idf_ setter on TfidfVectorizer)
        self._vectorizer.idf_ = np.load(idf_path)

        self.is_fitted = True
        self.vocabulary_size = len(vocab_array)
        logger.info(f"Loaded vocabulary of {self.vocabulary_size} n-grams")

def demonstrate_pattern_extraction():
    """
    Demonstrate how the vectorizer extracts character patterns.
    """
    # Example texts
    texts = [
        "Harry Potter and the Philosopher's Stone",
        "Harry Potter and the Chamber of Secrets",
        "The Lord of the Rings",
        "The Hobbit",
        "Quantum Mechanics"
    ]

    # Create vectorizer
    vectorizer = CharacterVectorizer(
        ngram_range=(3, 5),
        max_features=100
    )

    # Fit and analyze
    vectorizer.fit(texts)
    print("\nCharacter N-gram Analysis:")
    print("=" * 50)

    # Analyze first text
    analysis = vectorizer.analyze_text(texts[0])
    print(f"\nTop n-grams for: '{texts[0]}'")
    for ngram, score in list(analysis.items())[:10]:
        print(f"  '{ngram}': {score:.3f}")

    # Show pattern sharing between similar texts
    print("\nShared patterns between Harry Potter books:")
    hp1_ngrams = set(vectorizer.analyze_text(texts[0]).keys())
    hp2_ngrams = set(vectorizer.analyze_text(texts[1]).keys())
    shared = hp1_ngrams.intersection(hp2_ngrams)
    print(f"  Shared n-grams: {len(shared)}")
    print(f"  Examples: {list(shared)[:5]}")

if __name__ == "__main__":
    demonstrate_pattern_extraction()