#!/usr/bin/env python3
# rrpram.py — Recursive Resonant Pattern Recognition Attention Mechanism Tokenizer
#
# SentencePiece-based tokenization for haze.
# Captures n-grams, subwords, and resonant patterns directly in the vocabulary.
#
# Why "rrpram"? Because the tokenizer IS the first layer of pattern recognition.
# Before attention even runs, we're already finding patterns.
#
# Usage:
#   from haze.rrpram import RRPRAMVocab
#   vocab = RRPRAMVocab.train("text.txt", vocab_size=1000)
#   tokens = vocab.encode("the haze settles")
#   text = vocab.decode(tokens)
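#
# Illustrative only (the actual pieces depend on the corpus and vocab_size; "▁" is
# SentencePiece's word-boundary marker), a trained vocab might split the text as:
#   vocab.encode_pieces("the haze settles")
#   # -> ['▁the', '▁ha', 'ze', '▁settles']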

from __future__ import annotations
import os
import tempfile
from pathlib import Path
from typing import List, Optional, Union
from dataclasses import dataclass

try:
    import sentencepiece as spm
    HAS_SENTENCEPIECE = True
except ImportError:
    HAS_SENTENCEPIECE = False
    print("[rrpram] sentencepiece not found. Install it: pip install sentencepiece")


@dataclass
class RRPRAMVocab:
    """
    RRPRAM Vocabulary: SentencePiece-based tokenizer for haze.
    
    Uses BPE or Unigram model to capture:
    - Frequent n-grams as single tokens
    - Subword patterns (morphology)
    - Resonant character sequences
    
    This is the first layer of pattern recognition—before attention,
    we're already finding structure in the text.
    """
    
    model_path: str
    sp: "spm.SentencePieceProcessor"
    vocab_size: int
    
    @classmethod
    def train(
        cls,
        corpus_path: Union[str, Path],
        vocab_size: int = 1000,
        model_type: str = "bpe",  # "bpe", "unigram", "char", "word"
        model_prefix: Optional[str] = None,
        character_coverage: float = 1.0,
        max_sentence_length: int = 4192,
        user_defined_symbols: Optional[List[str]] = None,
    ) -> "RRPRAMVocab":
        """
        Train a new SentencePiece model on corpus.
        
        Args:
            corpus_path: path to training text file
            vocab_size: target vocabulary size
            model_type: "bpe" (byte-pair), "unigram", "char", or "word"
            model_prefix: output model file prefix (default: temp file)
            character_coverage: fraction of characters to cover (1.0 = all)
            max_sentence_length: max chars per training sentence
            user_defined_symbols: custom symbols to include
        
        Returns:
            trained RRPRAMVocab instance
        """
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")
        
        corpus_path = Path(corpus_path)
        if not corpus_path.exists():
            raise FileNotFoundError(f"Corpus not found: {corpus_path}")
        
        # determine model output path
        if model_prefix is None:
            # create temp directory for model files
            tmp_dir = tempfile.mkdtemp(prefix="rrpram_")
            model_prefix = os.path.join(tmp_dir, "rrpram")
        
        # build training command
        train_args = [
            f"--input={corpus_path}",
            f"--model_prefix={model_prefix}",
            f"--vocab_size={vocab_size}",
            f"--model_type={model_type}",
            f"--character_coverage={character_coverage}",
            f"--max_sentence_length={max_sentence_length}",
            "--pad_id=0",
            "--unk_id=1",
            "--bos_id=2",
            "--eos_id=3",
            "--normalization_rule_name=identity",  # preserve case and chars
        ]
        
        if user_defined_symbols:
            train_args.append(f"--user_defined_symbols={','.join(user_defined_symbols)}")
        
        # train
        print(f"[rrpram] training {model_type} model on {corpus_path}")
        print(f"[rrpram] vocab_size={vocab_size}, coverage={character_coverage}")
        spm.SentencePieceTrainer.Train(" ".join(train_args))
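        # note: recent sentencepiece releases also accept keyword arguments, which
        # avoids the space-joined string breaking on paths that contain spaces, e.g.
        #   spm.SentencePieceTrainer.Train(input=str(corpus_path),
        #                                  model_prefix=model_prefix,
        #                                  vocab_size=vocab_size,
        #                                  model_type=model_type)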
        
        model_path = f"{model_prefix}.model"
        print(f"[rrpram] model saved to {model_path}")
        
        # load trained model
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)
        
        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )
    
    @classmethod
    def load(cls, model_path: Union[str, Path]) -> "RRPRAMVocab":
        """Load a pre-trained SentencePiece model."""
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")
        
        model_path = str(model_path)
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)
        
        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )
    
    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        return self.sp.EncodeAsIds(text)
    
    def decode(self, ids: List[int]) -> str:
        """Decode token IDs to text."""
        return self.sp.DecodeIds(ids)
    
    def encode_pieces(self, text: str) -> List[str]:
        """Encode text to subword pieces (for visualization)."""
        return self.sp.EncodeAsPieces(text)
    
    def decode_pieces(self, pieces: List[str]) -> str:
        """Decode subword pieces to text."""
        return self.sp.DecodePieces(pieces)
    
    def get_piece(self, token_id: int) -> str:
        """Get the piece (token) for a given ID."""
        return self.sp.IdToPiece(token_id)
    
    def get_id(self, piece: str) -> int:
        """Get the ID for a given piece (token)."""
        return self.sp.PieceToId(piece)
    
    def __len__(self) -> int:
        return self.vocab_size
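
# Round-trip sketch (illustrative; real IDs depend on the trained model, and the
# model path below is hypothetical):
#   vocab = RRPRAMVocab.load("rrpram.model")
#   ids = vocab.encode("the haze settles")     # e.g. [47, 12, 255, 9]
#   vocab.decode(ids) == "the haze settles"    # exact when every char was covered in training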


def analyze_vocab(vocab: RRPRAMVocab, top_n: int = 50) -> None:
    """
    Analyze and display vocabulary statistics.
    
    Lists the first top_n entries of the vocabulary: special symbols first, then
    learned pieces. For BPE these early pieces tend to be the most frequent
    patterns in the corpus, the "resonant patterns".
    """
    print("=" * 60)
    print("  RRPRAM Vocabulary Analysis")
    print("=" * 60)
    print(f"  vocab size: {vocab.vocab_size}")
    print()
    
    print(f"  Top {top_n} tokens (resonant patterns):")
    print("-" * 40)
    
    for i in range(min(top_n, vocab.vocab_size)):
        piece = vocab.get_piece(i)
        # visualize special chars
        display = piece.replace("▁", "_").replace("\n", "\\n")
        print(f"  {i:4d}: '{display}'")
    
    print()
    print("=" * 60)


def demo_tokenization(vocab: RRPRAMVocab, texts: List[str]) -> None:
    """
    Demo tokenization on sample texts.
    
    Shows how the RRPRAM tokenizer breaks down text into patterns.
    """
    print("=" * 60)
    print("  RRPRAM Tokenization Demo")
    print("=" * 60)
    
    for text in texts:
        print(f"\n  input: \"{text}\"")
        ids = vocab.encode(text)
        pieces = vocab.encode_pieces(text)
        
        print(f"  ids:   {ids}")
        print(f"  pieces: {pieces}")
        print(f"  tokens: {len(ids)}")
        
        # show reconstruction
        reconstructed = vocab.decode(ids)
        print(f"  decoded: \"{reconstructed}\"")
    
    print()
    print("=" * 60)


if __name__ == "__main__":
    import sys
    
    print("=" * 60)
    print("  rrpram.py — RRPRAM Tokenizer")
    print("=" * 60)
    print()
    
    # resolve corpus path: optional CLI argument, else default to text.txt
    corpus_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("text.txt")
    if not corpus_path.exists():
        print(f"[error] corpus not found: {corpus_path}")
        print()
        print("Usage:")
        print("  python rrpram.py             # train on text.txt")
        print("  python rrpram.py corpus.txt  # train on custom corpus")
        sys.exit(1)
    
    print(f"[rrpram] corpus: {corpus_path}")
    
    # train tokenizer
    vocab = RRPRAMVocab.train(
        corpus_path,
        vocab_size=500,
        model_type="bpe",
        character_coverage=1.0,
    )
    
    # analyze
    analyze_vocab(vocab, top_n=30)
    
    # demo
    demo_texts = [
        "the haze settles",
        "darling",
        "I love you",
        "What's the toast?",
    ]
    demo_tokenization(vocab, demo_texts)
    
    print()
    print("[rrpram] done. patterns recognized. resonance achieved.")