haze / cloud /rrpram_cloud.py
ataeff's picture
Upload 13 files
3279f65 verified
#!/usr/bin/env python3
# rrpram.py — Recursive Resonant Pattern Recognition Attention Mechanism Tokenizer
#
# SentencePiece-based tokenization for haze.
# Captures n-grams, subwords, and resonant patterns directly in the vocabulary.
#
# Why "rrpram"? Because the tokenizer IS the first layer of pattern recognition.
# Before attention even runs, we're already finding patterns.
#
# Usage:
# from haze.rrpram import RRPRAMVocab
# vocab = RRPRAMVocab.train("text.txt", vocab_size=1000)
# tokens = vocab.encode("the haze settles")
# text = vocab.decode(tokens)
from __future__ import annotations
import os
import tempfile
from pathlib import Path
from typing import List, Optional, Union
from dataclasses import dataclass
try:
import sentencepiece as spm
HAS_SENTENCEPIECE = True
except ImportError:
HAS_SENTENCEPIECE = False
print("[rrpram] sentencepiece not found. Install it: pip install sentencepiece")
@dataclass
class RRPRAMVocab:
    """
    RRPRAM Vocabulary: SentencePiece-based tokenizer for haze.

    Thin wrapper around a loaded ``SentencePieceProcessor``. Instances are
    normally created through :meth:`train` (fit a new model on a corpus) or
    :meth:`load` (open an existing ``.model`` file); both set ``vocab_size``
    from the processor's reported piece count.

    Uses BPE or Unigram model to capture:
    - Frequent n-grams as single tokens
    - Subword patterns (morphology)
    - Resonant character sequences

    This is the first layer of pattern recognition—before attention,
    we're already finding structure in the text.
    """

    # path of the ``.model`` file the processor was loaded from
    model_path: str
    # loaded SentencePiece processor; every encode/decode call delegates to it
    sp: "spm.SentencePieceProcessor"
    # number of pieces in the vocabulary (``sp.GetPieceSize()`` when built
    # via ``train`` or ``load``)
    vocab_size: int

    @classmethod
    def train(
        cls,
        corpus_path: Union[str, Path],
        vocab_size: int = 1000,
        model_type: str = "bpe",  # "bpe", "unigram", "char", "word"
        model_prefix: Optional[str] = None,
        character_coverage: float = 1.0,
        max_sentence_length: int = 4192,
        user_defined_symbols: Optional[List[str]] = None,
    ) -> "RRPRAMVocab":
        """
        Train a new SentencePiece model on corpus and load it.

        Args:
            corpus_path: path to training text file
            vocab_size: target vocabulary size
            model_type: "bpe" (byte-pair), "unigram", "char", or "word"
            model_prefix: output model file prefix; when omitted, a fresh
                temporary directory is created and the model is written
                there (the directory is not removed, because the returned
                instance refers to the model file inside it)
            character_coverage: fraction of characters to cover (1.0 = all)
            max_sentence_length: maximum training sentence length, passed
                straight to SentencePiece's ``max_sentence_length`` option
            user_defined_symbols: custom symbols to include

        Returns:
            trained RRPRAMVocab instance

        Raises:
            ImportError: sentencepiece is not installed
            FileNotFoundError: ``corpus_path`` does not exist
        """
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")

        corpus_path = Path(corpus_path)
        if not corpus_path.exists():
            raise FileNotFoundError(f"Corpus not found: {corpus_path}")

        # determine model output path
        if model_prefix is None:
            # create temp directory for model files
            tmp_dir = tempfile.mkdtemp(prefix="rrpram_")
            model_prefix = os.path.join(tmp_dir, "rrpram")

        # Options are passed as keyword arguments rather than one
        # space-joined flag string: a flag string is split on whitespace, so
        # a corpus path, model prefix or symbol containing a space would be
        # torn into bogus flags.
        train_kwargs = {
            "input": str(corpus_path),
            "model_prefix": str(model_prefix),
            "vocab_size": vocab_size,
            "model_type": model_type,
            "character_coverage": character_coverage,
            "max_sentence_length": max_sentence_length,
            "pad_id": 0,
            "unk_id": 1,
            "bos_id": 2,
            "eos_id": 3,
            "normalization_rule_name": "identity",  # preserve case and chars
        }
        if user_defined_symbols:
            # must be a real list so the wrapper serialises it as one option
            train_kwargs["user_defined_symbols"] = list(user_defined_symbols)

        # train
        print(f"[rrpram] training {model_type} model on {corpus_path}")
        print(f"[rrpram] vocab_size={vocab_size}, coverage={character_coverage}")
        spm.SentencePieceTrainer.Train(**train_kwargs)

        model_path = f"{model_prefix}.model"
        print(f"[rrpram] model saved to {model_path}")

        # load trained model
        return cls._from_model_file(model_path)

    @classmethod
    def load(cls, model_path: Union[str, Path]) -> "RRPRAMVocab":
        """
        Load a pre-trained SentencePiece model.

        Args:
            model_path: path to an existing ``.model`` file

        Returns:
            RRPRAMVocab wrapping the loaded processor

        Raises:
            ImportError: sentencepiece is not installed
            FileNotFoundError: ``model_path`` does not exist
        """
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")

        model_path = str(model_path)
        # fail early with a clear message, mirroring the corpus check in train()
        if not Path(model_path).exists():
            raise FileNotFoundError(f"Model not found: {model_path}")
        return cls._from_model_file(model_path)

    @classmethod
    def _from_model_file(cls, model_path: str) -> "RRPRAMVocab":
        """Load the processor from ``model_path`` and wrap it in an instance."""
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)
        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )

    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        return self.sp.EncodeAsIds(text)

    def decode(self, ids: List[int]) -> str:
        """Decode token IDs to text."""
        return self.sp.DecodeIds(ids)

    def encode_pieces(self, text: str) -> List[str]:
        """Encode text to subword pieces (for visualization)."""
        return self.sp.EncodeAsPieces(text)

    def decode_pieces(self, pieces: List[str]) -> str:
        """Decode subword pieces to text."""
        return self.sp.DecodePieces(pieces)

    # NOTE: the parameter is named ``id`` (shadowing the builtin) in the
    # public signature; it is kept so keyword callers keep working.
    def get_piece(self, id: int) -> str:
        """Get the piece (token) for a given ID."""
        return self.sp.IdToPiece(id)

    def get_id(self, piece: str) -> int:
        """Get the ID for a given piece (token)."""
        return self.sp.PieceToId(piece)

    def __len__(self) -> int:
        """Return the stored vocabulary size."""
        return self.vocab_size
def analyze_vocab(vocab: RRPRAMVocab, top_n: int = 50) -> None:
    """
    Print a short report about a vocabulary.

    Writes the vocabulary size followed by the pieces for the first
    ``min(top_n, vocab.vocab_size)`` token IDs, in ID order (the listing
    is by ID; it does not compute frequencies itself). The word-boundary
    marker "▁" is shown as "_" and a newline as a literal backslash-n so
    every piece stays on one line.

    Args:
        vocab: vocabulary to inspect
        top_n: maximum number of pieces to list
    """
    heavy_rule = "=" * 60
    # single-pass replacement of characters that are hard to read in a terminal
    make_visible = str.maketrans({"▁": "_", "\n": "\\n"})

    print(heavy_rule)
    print(" RRPRAM Vocabulary Analysis")
    print(heavy_rule)
    print(f" vocab size: {vocab.vocab_size}")
    print()
    print(f" Top {top_n} tokens (resonant patterns):")
    print("-" * 40)

    shown = min(top_n, vocab.vocab_size)
    for token_id in range(shown):
        display = vocab.get_piece(token_id).translate(make_visible)
        print(f" {token_id:4d}: '{display}'")

    print()
    print(heavy_rule)
def demo_tokenization(vocab: RRPRAMVocab, texts: List[str]) -> None:
    """
    Print a tokenization walkthrough for each sample text.

    For every entry in ``texts`` this shows the input, its token IDs, its
    subword pieces, the token count, and the text obtained by decoding the
    IDs again, so the round trip can be checked by eye.

    Args:
        vocab: vocabulary used to encode and decode
        texts: sample strings to run through the tokenizer
    """
    heavy_rule = "=" * 60
    print(heavy_rule)
    print(" RRPRAM Tokenization Demo")
    print(heavy_rule)

    for sample in texts:
        print(f'\n input: "{sample}"')
        token_ids = vocab.encode(sample)
        subwords = vocab.encode_pieces(sample)
        print(f" ids: {token_ids}")
        print(f" pieces: {subwords}")
        print(f" tokens: {len(token_ids)}")

        # decode the IDs again to show the reconstruction
        round_trip = vocab.decode(token_ids)
        print(f' decoded: "{round_trip}"')

    print()
    print(heavy_rule)
if __name__ == "__main__":
    # Script entry point: train a small BPE tokenizer on a corpus, print the
    # first vocabulary entries, and run a tokenization demo on sample texts.
    import sys

    print("=" * 60)
    print(" rrpram.py — RRPRAM Tokenizer")
    print("=" * 60)
    print()

    # Resolve the corpus BEFORE checking that it exists: an explicit
    # command-line argument takes precedence over the default text.txt.
    # (Checking text.txt first made a custom corpus unusable whenever
    # text.txt was absent.)
    corpus_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("text.txt")

    if not corpus_path.exists():
        print(f"[error] {corpus_path} not found")
        print()
        print("Usage:")
        print(" python rrpram.py # train on text.txt")
        print(" python rrpram.py corpus.txt # train on custom corpus")
        sys.exit(1)

    print(f"[rrpram] corpus: {corpus_path}")

    # train tokenizer
    vocab = RRPRAMVocab.train(
        corpus_path,
        vocab_size=500,
        model_type="bpe",
        character_coverage=1.0,
    )

    # analyze
    analyze_vocab(vocab, top_n=30)

    # demo
    demo_texts = [
        "the haze settles",
        "darling",
        "I love you",
        "What's the toast?",
    ]
    demo_tokenization(vocab, demo_texts)

    print()
    print("[rrpram] done. patterns recognized. resonance achieved.")