#!/usr/bin/env python3
# rrpram.py — Recursive Resonant Pattern Recognition Attention Mechanism Tokenizer
#
# SentencePiece-based tokenization for haze.
# Captures n-grams, subwords, and resonant patterns directly in the vocabulary.
#
# Why "rrpram"? Because the tokenizer IS the first layer of pattern recognition.
# Before attention even runs, we're already finding patterns.
#
# Usage:
#   from haze.rrpram import RRPRAMVocab
#   vocab = RRPRAMVocab.train("text.txt", vocab_size=1000)
#   tokens = vocab.encode("the haze settles")
#   text = vocab.decode(tokens)
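#
# Illustrative only (the actual pieces depend on the corpus and vocab_size; "▁" is
# SentencePiece's word-boundary marker), a trained vocab might split the text as:
#   vocab.encode_pieces("the haze settles")
#   # -> ['▁the', '▁ha', 'ze', '▁settles']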

from __future__ import annotations
import os
import tempfile
from pathlib import Path
from typing import List, Optional, Union
from dataclasses import dataclass

try:
    import sentencepiece as spm
    HAS_SENTENCEPIECE = True
except ImportError:
    HAS_SENTENCEPIECE = False
    print("[rrpram] sentencepiece not found. Install it: pip install sentencepiece")


@dataclass
class RRPRAMVocab:
    """
    RRPRAM Vocabulary: SentencePiece-based tokenizer for haze.
    
    Uses BPE or Unigram model to capture:
    - Frequent n-grams as single tokens
    - Subword patterns (morphology)
    - Resonant character sequences
    
    This is the first layer of pattern recognition—before attention,
    we're already finding structure in the text.
    """
    
    model_path: str
    sp: "spm.SentencePieceProcessor"
    vocab_size: int
    
    @classmethod
    def train(
        cls,
        corpus_path: Union[str, Path],
        vocab_size: int = 1000,
        model_type: str = "bpe",  # "bpe", "unigram", "char", "word"
        model_prefix: Optional[str] = None,
        character_coverage: float = 1.0,
        max_sentence_length: int = 4192,
        user_defined_symbols: Optional[List[str]] = None,
    ) -> "RRPRAMVocab":
        """
        Train a new SentencePiece model on corpus.
        
        Args:
            corpus_path: path to training text file
            vocab_size: target vocabulary size
            model_type: "bpe" (byte-pair), "unigram", "char", or "word"
            model_prefix: output model file prefix (default: temp file)
            character_coverage: fraction of characters to cover (1.0 = all)
            max_sentence_length: max chars per training sentence
            user_defined_symbols: custom symbols to include
        
        Returns:
            trained RRPRAMVocab instance
        """
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")
        
        corpus_path = Path(corpus_path)
        if not corpus_path.exists():
            raise FileNotFoundError(f"Corpus not found: {corpus_path}")
        
        # determine model output path
        if model_prefix is None:
            # create temp directory for model files
            tmp_dir = tempfile.mkdtemp(prefix="rrpram_")
            model_prefix = os.path.join(tmp_dir, "rrpram")
        
        # build training command
        train_args = [
            f"--input={corpus_path}",
            f"--model_prefix={model_prefix}",
            f"--vocab_size={vocab_size}",
            f"--model_type={model_type}",
            f"--character_coverage={character_coverage}",
            f"--max_sentence_length={max_sentence_length}",
            "--pad_id=0",
            "--unk_id=1",
            "--bos_id=2",
            "--eos_id=3",
            "--normalization_rule_name=identity",  # preserve case and chars
        ]
        
        if user_defined_symbols:
            train_args.append(f"--user_defined_symbols={','.join(user_defined_symbols)}")
        
        # train
        print(f"[rrpram] training {model_type} model on {corpus_path}")
        print(f"[rrpram] vocab_size={vocab_size}, coverage={character_coverage}")
        spm.SentencePieceTrainer.Train(" ".join(train_args))
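        # note: recent sentencepiece releases also accept keyword arguments, which
        # avoids the space-joined string breaking on paths that contain spaces, e.g.
        #   spm.SentencePieceTrainer.Train(input=str(corpus_path),
        #                                  model_prefix=model_prefix,
        #                                  vocab_size=vocab_size,
        #                                  model_type=model_type)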
        
        model_path = f"{model_prefix}.model"
        print(f"[rrpram] model saved to {model_path}")
        
        # load trained model
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)
        
        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )
    
    @classmethod
    def load(cls, model_path: Union[str, Path]) -> "RRPRAMVocab":
        """Load a pre-trained SentencePiece model."""
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")
        
        model_path = str(model_path)
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)
        
        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )
    
    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        return self.sp.EncodeAsIds(text)
    
    def decode(self, ids: List[int]) -> str:
        """Decode token IDs to text."""
        return self.sp.DecodeIds(ids)
    
    def encode_pieces(self, text: str) -> List[str]:
        """Encode text to subword pieces (for visualization)."""
        return self.sp.EncodeAsPieces(text)
    
    def decode_pieces(self, pieces: List[str]) -> str:
        """Decode subword pieces to text."""
        return self.sp.DecodePieces(pieces)
    
    def get_piece(self, token_id: int) -> str:
        """Get the piece (token) for a given ID."""
        return self.sp.IdToPiece(token_id)
    
    def get_id(self, piece: str) -> int:
        """Get the ID for a given piece (token)."""
        return self.sp.PieceToId(piece)
    
    def __len__(self) -> int:
        return self.vocab_size
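
# Round-trip sketch (illustrative; real IDs depend on the trained model, and the
# model path below is hypothetical):
#   vocab = RRPRAMVocab.load("rrpram.model")
#   ids = vocab.encode("the haze settles")     # e.g. [47, 12, 255, 9]
#   vocab.decode(ids) == "the haze settles"    # exact when every char was covered in training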


def analyze_vocab(vocab: RRPRAMVocab, top_n: int = 50) -> None:
    """
    Analyze and display vocabulary statistics.
    
    Lists the first top_n entries of the vocabulary: special symbols first, then
    learned pieces. For BPE these early pieces tend to be the most frequent
    patterns in the corpus, the "resonant patterns".
    """
    print("=" * 60)
    print("  RRPRAM Vocabulary Analysis")
    print("=" * 60)
    print(f"  vocab size: {vocab.vocab_size}")
    print()
    
    print(f"  Top {top_n} tokens (resonant patterns):")
    print("-" * 40)
    
    for i in range(min(top_n, vocab.vocab_size)):
        piece = vocab.get_piece(i)
        # visualize special chars
        display = piece.replace("▁", "_").replace("\n", "\\n")
        print(f"  {i:4d}: '{display}'")
    
    print()
    print("=" * 60)


def demo_tokenization(vocab: RRPRAMVocab, texts: List[str]) -> None:
    """
    Demo tokenization on sample texts.
    
    Shows how the RRPRAM tokenizer breaks down text into patterns.
    """
    print("=" * 60)
    print("  RRPRAM Tokenization Demo")
    print("=" * 60)
    
    for text in texts:
        print(f"\n  input: \"{text}\"")
        ids = vocab.encode(text)
        pieces = vocab.encode_pieces(text)
        
        print(f"  ids:   {ids}")
        print(f"  pieces: {pieces}")
        print(f"  tokens: {len(ids)}")
        
        # show reconstruction
        reconstructed = vocab.decode(ids)
        print(f"  decoded: \"{reconstructed}\"")
    
    print()
    print("=" * 60)


if __name__ == "__main__":
    import sys
    
    print("=" * 60)
    print("  rrpram.py — RRPRAM Tokenizer")
    print("=" * 60)
    print()
    
    # resolve corpus path: optional CLI argument, else default to text.txt
    corpus_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("text.txt")
    if not corpus_path.exists():
        print(f"[error] corpus not found: {corpus_path}")
        print()
        print("Usage:")
        print("  python rrpram.py             # train on text.txt")
        print("  python rrpram.py corpus.txt  # train on custom corpus")
        sys.exit(1)
    
    print(f"[rrpram] corpus: {corpus_path}")
    
    # train tokenizer
    vocab = RRPRAMVocab.train(
        corpus_path,
        vocab_size=500,
        model_type="bpe",
        character_coverage=1.0,
    )
    
    # analyze
    analyze_vocab(vocab, top_n=30)
    
    # demo
    demo_texts = [
        "the haze settles",
        "darling",
        "I love you",
        "What's the toast?",
    ]
    demo_tokenization(vocab, demo_texts)
    
    print()
    print("[rrpram] done. patterns recognized. resonance achieved.")