from __future__ import annotations
import os
import tempfile
from pathlib import Path
from typing import List, Optional, Union
from dataclasses import dataclass

try:
    import sentencepiece as spm
    HAS_SENTENCEPIECE = True
except ImportError:
    HAS_SENTENCEPIECE = False
    print("[rrpram] sentencepiece not found. Install it: pip install sentencepiece")


@dataclass
class RRPRAMVocab:
    """
    RRPRAM Vocabulary: SentencePiece-based tokenizer for haze.

    Uses a BPE or Unigram model to capture:
    - Frequent n-grams as single tokens
    - Subword patterns (morphology)
    - Resonant character sequences

    This is the first layer of pattern recognition: before attention,
    we're already finding structure in the text.
    """

    model_path: str
    sp: "spm.SentencePieceProcessor"
    vocab_size: int

    @classmethod
    def train(
        cls,
        corpus_path: Union[str, Path],
        vocab_size: int = 1000,
        model_type: str = "bpe",
        model_prefix: Optional[str] = None,
        character_coverage: float = 1.0,
        max_sentence_length: int = 4192,
        user_defined_symbols: Optional[List[str]] = None,
    ) -> "RRPRAMVocab":
        """
        Train a new SentencePiece model on a corpus.

        Args:
            corpus_path: path to training text file
            vocab_size: target vocabulary size
            model_type: "bpe" (byte-pair), "unigram", "char", or "word"
            model_prefix: output model file prefix (default: temp file)
            character_coverage: fraction of characters to cover (1.0 = all)
            max_sentence_length: max bytes per training sentence
            user_defined_symbols: custom symbols to include as atomic tokens

        Returns:
            trained RRPRAMVocab instance
        """
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")

        corpus_path = Path(corpus_path)
        if not corpus_path.exists():
            raise FileNotFoundError(f"Corpus not found: {corpus_path}")

        # Default to a fresh temp directory so repeated runs don't clobber models.
        if model_prefix is None:
            tmp_dir = tempfile.mkdtemp(prefix="rrpram_")
            model_prefix = os.path.join(tmp_dir, "rrpram")

        # Fixed special-token IDs: 0=pad, 1=unk, 2=bos, 3=eos. Identity
        # normalization keeps the raw text untouched (no NFKC folding).
        train_args = [
            f"--input={corpus_path}",
            f"--model_prefix={model_prefix}",
            f"--vocab_size={vocab_size}",
            f"--model_type={model_type}",
            f"--character_coverage={character_coverage}",
            f"--max_sentence_length={max_sentence_length}",
            "--pad_id=0",
            "--unk_id=1",
            "--bos_id=2",
            "--eos_id=3",
            "--normalization_rule_name=identity",
        ]

        if user_defined_symbols:
            train_args.append(f"--user_defined_symbols={','.join(user_defined_symbols)}")

        print(f"[rrpram] training {model_type} model on {corpus_path}")
        print(f"[rrpram] vocab_size={vocab_size}, coverage={character_coverage}")
        spm.SentencePieceTrainer.Train(" ".join(train_args))

        model_path = f"{model_prefix}.model"
        print(f"[rrpram] model saved to {model_path}")

        # Load the freshly trained model so the instance is ready to use.
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)

        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )

    @classmethod
    def load(cls, model_path: Union[str, Path]) -> "RRPRAMVocab":
        """Load a pre-trained SentencePiece model from disk."""
        if not HAS_SENTENCEPIECE:
            raise ImportError("sentencepiece required. Install: pip install sentencepiece")

        model_path = str(model_path)
        sp = spm.SentencePieceProcessor()
        sp.Load(model_path)

        return cls(
            model_path=model_path,
            sp=sp,
            vocab_size=sp.GetPieceSize(),
        )

    def encode(self, text: str) -> List[int]:
        """Encode text to token IDs."""
        return self.sp.EncodeAsIds(text)

    def decode(self, ids: List[int]) -> str:
        """Decode token IDs back to text."""
        return self.sp.DecodeIds(ids)

    def encode_pieces(self, text: str) -> List[str]:
        """Encode text to subword pieces (useful for visualization)."""
        return self.sp.EncodeAsPieces(text)

    def decode_pieces(self, pieces: List[str]) -> str:
        """Decode subword pieces back to text."""
        return self.sp.DecodePieces(pieces)

    def get_piece(self, token_id: int) -> str:
        """Get the piece (token string) for a given ID."""
        return self.sp.IdToPiece(token_id)

    def get_id(self, piece: str) -> int:
        """Get the ID for a given piece (token string)."""
        return self.sp.PieceToId(piece)

    def __len__(self) -> int:
        return self.vocab_size


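# A minimal usage sketch of the class above. The helper name and the sample
# sentence are illustrative, not part of the demo pipeline below; it assumes
# `corpus` points at an existing text file large enough for the requested
# vocab_size, and that the corpus covers the sample sentence's characters.
def _example_roundtrip(corpus: Union[str, Path]) -> None:
    vocab = RRPRAMVocab.train(corpus, vocab_size=300)
    ids = vocab.encode("the haze settles")
    print(vocab.encode_pieces("the haze settles"))  # subword pieces; "▁" marks word starts
    print(vocab.decode(ids))         # reproduces the input when all chars are covered
    # Reloading the saved .model file yields an equivalent vocabulary.
    reloaded = RRPRAMVocab.load(vocab.model_path)
    print(len(reloaded) == len(vocab))  # True: same model file

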
def analyze_vocab(vocab: RRPRAMVocab, top_n: int = 50) -> None:
    """
    Analyze and display vocabulary statistics.

    Shows the lowest-ID tokens (patterns) learned by the tokenizer.
    These are the "resonant patterns" that appear frequently in the corpus.
    Note that IDs 0-3 are the pad/unk/bos/eos control tokens.
    """
    print("=" * 60)
    print(" RRPRAM Vocabulary Analysis")
    print("=" * 60)
    print(f" vocab size: {vocab.vocab_size}")
    print()

    print(f" Top {top_n} tokens (resonant patterns):")
    print("-" * 40)

    for i in range(min(top_n, vocab.vocab_size)):
        piece = vocab.get_piece(i)
        # Make the SentencePiece word-boundary marker and newlines printable.
        display = piece.replace("▁", "_").replace("\n", "\\n")
        print(f" {i:4d}: '{display}'")

    print()
    print("=" * 60)


def demo_tokenization(vocab: RRPRAMVocab, texts: List[str]) -> None:
    """
    Demo tokenization on sample texts.

    Shows how the RRPRAM tokenizer breaks each text down into patterns.
    """
    print("=" * 60)
    print(" RRPRAM Tokenization Demo")
    print("=" * 60)

    for text in texts:
        print(f"\n input: \"{text}\"")
        ids = vocab.encode(text)
        pieces = vocab.encode_pieces(text)

        print(f" ids: {ids}")
        print(f" pieces: {pieces}")
        print(f" tokens: {len(ids)}")

        # Round-trip check: decoding should reconstruct the input text.
        reconstructed = vocab.decode(ids)
        print(f" decoded: \"{reconstructed}\"")

    print()
    print("=" * 60)


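# Sketch of the user_defined_symbols option, which the demo below doesn't
# exercise. The symbol names are illustrative; reserved symbols survive
# tokenization as single atomic pieces with IDs near the control tokens.
def _example_user_symbols(corpus: Union[str, Path]) -> None:
    vocab = RRPRAMVocab.train(
        corpus,
        vocab_size=500,
        user_defined_symbols=["<sep>", "<mask>"],
    )
    print(vocab.get_id("<sep>"))                    # dedicated ID right after pad/unk/bos/eos
    print(vocab.encode_pieces("left <sep> right"))  # "<sep>" stays one piece

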
if __name__ == "__main__":
    import sys

    print("=" * 60)
    print(" rrpram.py — RRPRAM Tokenizer")
    print("=" * 60)
    print()

    # Use the corpus given on the command line, falling back to text.txt.
    corpus_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("text.txt")
    if not corpus_path.exists():
        print(f"[error] {corpus_path} not found")
        print()
        print("Usage:")
        print("  python rrpram.py              # train on text.txt")
        print("  python rrpram.py corpus.txt   # train on custom corpus")
        sys.exit(1)

    print(f"[rrpram] corpus: {corpus_path}")

    # Train a small BPE vocab on the corpus.
    vocab = RRPRAMVocab.train(
        corpus_path,
        vocab_size=500,
        model_type="bpe",
        character_coverage=1.0,
    )

    # Show the most resonant patterns the tokenizer learned.
    analyze_vocab(vocab, top_n=30)

    # Tokenize a few sample lines end to end.
    demo_texts = [
        "the haze settles",
        "darling",
        "I love you",
        "What's the toast?",
    ]
    demo_tokenization(vocab, demo_texts)

    print()
    print("[rrpram] done. patterns recognized. resonance achieved.")