Vjeong Claude Opus 4.6 committed on
Commit
a5ca4e4
ยท
1 Parent(s): 8626149

Use LLaMA 2 pretrained tokenizer and remove tokenizer_mode option

Browse files

Replace custom tokenizer training/loading modes with a single pretrained
LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf, 32K vocab). This simplifies
setup_data_pipeline() to require no tokenizer arguments, using
DataConfig.tokenizer_name as the single source of truth.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

llm_lab/config/data_config.py CHANGED
@@ -18,10 +18,10 @@ class DataConfig:
18
  text_column: str = "text" # column name containing text
19
 
20
  # โ”€โ”€ Tokenizer โ”€โ”€
21
- tokenizer_type: str = "sentencepiece" # "sentencepiece" or "hf"
22
- # path to a pretrained tokenizer (trains a new one if not provided)
23
- tokenizer_path: Optional[str] = None
24
- vocab_size: int = 32_000
25
 
26
  # โ”€โ”€ Sequence โ”€โ”€
27
  max_seq_len: int = 2048
 
18
  text_column: str = "text" # column name containing text
19
 
20
  # โ”€โ”€ Tokenizer โ”€โ”€
21
+ # LLaMA 2 tokenizer mirror โ€” no HuggingFace authentication required
22
+ # SentencePiece BPE, 32K vocab โ€” optimal for 1B-scale models
23
+ tokenizer_name: str = "NousResearch/Llama-2-7b-hf"
24
+ vocab_size: int = 32_000 # LLaMA 2 vocab size
25
 
26
  # โ”€โ”€ Sequence โ”€โ”€
27
  max_seq_len: int = 2048
llm_lab/config/model_config.py CHANGED
@@ -10,7 +10,7 @@ class ModelConfig:
10
  - small: ~100M (for intermediate validation)
11
  - base: ~1.1B (final target)
12
  """
13
- vocab_size: int = 32_000
14
  hidden_dim: int = 2048 # d_model: base dimension of the model
15
  num_layers: int = 22 # number of Transformer blocks
16
  num_heads: int = 16 # number of Query heads
 
10
  - small: ~100M (for intermediate validation)
11
  - base: ~1.1B (final target)
12
  """
13
+ vocab_size: int = 32_000 # LLaMA 2 tokenizer vocab size
14
  hidden_dim: int = 2048 # d_model: base dimension of the model
15
  num_layers: int = 22 # number of Transformer blocks
16
  num_heads: int = 16 # number of Query heads
llm_lab/data/pipeline.py CHANGED
@@ -87,34 +87,18 @@ def train_tokenizer_from_dataset(config: DataConfig) -> Tokenizer:
87
 
88
 
89
  def setup_data_pipeline(
90
- tokenizer_mode: str = "train_new",
91
- tokenizer_path: Optional[str] = None,
92
  config: Optional[DataConfig] = None,
93
  ) -> tuple:
94
  """Sets up the data pipeline in one call.
95
 
96
- Args:
97
- tokenizer_mode:
98
- "train_new" - Train a new BPE tokenizer
99
- "load_trained" - Load a previously trained tokenizer
100
- "pretrained" - Use a pretrained HuggingFace tokenizer
101
- tokenizer_path:
102
- "train_new" -> Save directory (default: ./tokenizer)
103
- "load_trained" -> Path to the saved tokenizer
104
- "pretrained" -> HF model name (default: mistralai/Mistral-7B-v0.1)
105
 
106
  Returns:
107
  (tokenizer, train_dataloader, val_dataloader)
108
 
109
  Example usage (Colab):
110
- # Method 1: Train a new tokenizer
111
- tok, train_dl, val_dl = setup_data_pipeline("train_new")
112
-
113
- # Method 2: Load an existing tokenizer
114
- tok, train_dl, val_dl = setup_data_pipeline("load_trained", "./tokenizer")
115
-
116
- # Method 3: Use a pretrained tokenizer (simplest)
117
- tok, train_dl, val_dl = setup_data_pipeline("pretrained")
118
  """
119
  config = config or DataConfig()
120
 
@@ -124,17 +108,7 @@ def setup_data_pipeline(
124
 
125
  # โ”€โ”€ Step 1: Tokenizer โ”€โ”€
126
  tokenizer = Tokenizer(config)
127
-
128
- if tokenizer_mode == "train_new":
129
- tokenizer = train_tokenizer_from_dataset(config)
130
- elif tokenizer_mode == "load_trained":
131
- path = tokenizer_path or config.tokenizer_save_dir
132
- tokenizer.load_trained_hf(path)
133
- elif tokenizer_mode == "pretrained":
134
- name = tokenizer_path or "mistralai/Mistral-7B-v0.1"
135
- tokenizer.load_pretrained_hf(name)
136
- else:
137
- raise ValueError(f"Unknown tokenizer_mode: {tokenizer_mode}")
138
 
139
  # โ”€โ”€ Step 2: Training DataLoader โ”€โ”€
140
  print("\n[DataLoader] Creating training DataLoader...")
 
87
 
88
 
89
  def setup_data_pipeline(
 
 
90
  config: Optional[DataConfig] = None,
91
  ) -> tuple:
92
  """Sets up the data pipeline in one call.
93
 
94
+ Uses the LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf) by default.
95
+ Configurable via DataConfig.tokenizer_name.
 
 
 
 
 
 
 
96
 
97
  Returns:
98
  (tokenizer, train_dataloader, val_dataloader)
99
 
100
  Example usage (Colab):
101
+ tok, train_dl, val_dl = setup_data_pipeline()
 
 
 
 
 
 
 
102
  """
103
  config = config or DataConfig()
104
 
 
108
 
109
  # โ”€โ”€ Step 1: Tokenizer โ”€โ”€
110
  tokenizer = Tokenizer(config)
111
+ tokenizer.load_pretrained_hf()
 
 
 
 
 
 
 
 
 
 
112
 
113
  # โ”€โ”€ Step 2: Training DataLoader โ”€โ”€
114
  print("\n[DataLoader] Creating training DataLoader...")
llm_lab/data/tokenizer.py CHANGED
@@ -137,15 +137,19 @@ class Tokenizer:
137
  # Method 3: Load a pretrained HF tokenizer
138
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
139
 
140
- def load_pretrained_hf(self, name_or_path: str = "meta-llama/Llama-2-7b-hf"):
141
  """Loads a pretrained tokenizer from HuggingFace.
142
 
143
- The simplest method. The LLaMA tokenizer has a 32K vocab and is BPE-based.
144
- Note: meta-llama models may require HF approval to access.
145
- Alternative: mistralai/Mistral-7B-v0.1 (no approval required)
 
 
 
146
  """
147
  from transformers import AutoTokenizer
148
 
 
149
  print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
150
  tokenizer = AutoTokenizer.from_pretrained(name_or_path)
151
 
 
137
  # Method 3: Load a pretrained HF tokenizer
138
  # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
139
 
140
+ def load_pretrained_hf(self, name_or_path: Optional[str] = None):
141
  """Loads a pretrained tokenizer from HuggingFace.
142
 
143
+ Default: LLaMA 2 tokenizer (NousResearch/Llama-2-7b-hf mirror).
144
+ - vocab_size : 32,000
145
+ - SentencePiece BPE โ€” optimal for 1B-scale models (TinyLlama, LLaMA 1/2)
146
+ - No HuggingFace authentication required (community mirror)
147
+ Official source (requires HF auth):
148
+ - "meta-llama/Llama-2-7b-hf"
149
  """
150
  from transformers import AutoTokenizer
151
 
152
+ name_or_path = name_or_path or self.config.tokenizer_name
153
  print(f"[Tokenizer] Loading HF tokenizer: {name_or_path}")
154
  tokenizer = AutoTokenizer.from_pretrained(name_or_path)
155
 
notebooks/01_data_pipeline.ipynb CHANGED
@@ -3,21 +3,7 @@
3
  {
4
  "cell_type": "markdown",
5
  "metadata": {},
6
- "source": [
7
- "# 01. ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ\n",
8
- "\n",
9
- "ํ† ํฌ๋‚˜์ด์ € ์ค€๋น„ โ†’ ๋ฐ์ดํ„ฐ ์ŠคํŠธ๋ฆฌ๋ฐ โ†’ ์‹œํ€€์Šค ํŒจํ‚น โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ\n",
10
- "\n",
11
- "**ํŒŒ์ดํ”„๋ผ์ธ ํ๋ฆ„:**\n",
12
- "```\n",
13
- "FineWeb-Edu (HuggingFace)\n",
14
- " โ†’ Streaming์œผ๋กœ ๋กœ๋“œ (๋””์Šคํฌ ์ €์žฅ ์—†์Œ)\n",
15
- " โ†’ ํ† ํฌ๋‚˜์ด์ง• (BPE, vocab=32K)\n",
16
- " โ†’ ์‹œํ€€์Šค ํŒจํ‚น (์—ฌ๋Ÿฌ ๋ฌธ์„œ๋ฅผ max_seq_len์œผ๋กœ ์—ฐ๊ฒฐ)\n",
17
- " โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ (input_ids, targets)\n",
18
- " โ†’ GPU ์ „์†ก\n",
19
- "```"
20
- ]
21
  },
22
  {
23
  "cell_type": "code",
@@ -59,45 +45,19 @@
59
  "execution_count": null,
60
  "metadata": {},
61
  "outputs": [],
62
- "source": [
63
- "data_config = DataConfig(\n",
64
- " dataset_name=\"HuggingFaceFW/fineweb-edu\",\n",
65
- " dataset_subset=\"sample-10BT\",\n",
66
- " vocab_size=32_000,\n",
67
- " max_seq_len=2048,\n",
68
- " batch_size=4,\n",
69
- " num_workers=2,\n",
70
- ")\n",
71
- "\n",
72
- "print(f\"๋ฐ์ดํ„ฐ์…‹: {data_config.dataset_name} ({data_config.dataset_subset})\")\n",
73
- "print(f\"์‹œํ€€์Šค ๊ธธ์ด: {data_config.max_seq_len}\")\n",
74
- "print(f\"๋ฐฐ์น˜ ํฌ๊ธฐ: {data_config.batch_size}\")\n",
75
- "print(f\"ํ† ํฐ/๋ฐฐ์น˜: {data_config.batch_size * data_config.max_seq_len:,}\")"
76
- ]
77
  },
78
  {
79
  "cell_type": "markdown",
80
  "metadata": {},
81
- "source": [
82
- "## 2. ํ† ํฌ๋‚˜์ด์ € ์„ค์ •\n",
83
- "\n",
84
- "์„ธ ๊ฐ€์ง€ ๋ฐฉ๋ฒ• ์ค‘ ์„ ํƒ:\n",
85
- "- `\"pretrained\"` โ€” HuggingFace ์‚ฌ์ „ํ•™์Šต ํ† ํฌ๋‚˜์ด์ € (๊ฐ€์žฅ ๊ฐ„ํŽธ)\n",
86
- "- `\"train_new\"` โ€” BPE ํ† ํฌ๋‚˜์ด์ € ์ƒˆ๋กœ ํ•™์Šต\n",
87
- "- `\"load_trained\"` โ€” ์ด์ „์— ํ•™์Šตํ•œ ํ† ํฌ๋‚˜์ด์ € ๋กœ๋“œ"
88
- ]
89
  },
90
  {
91
  "cell_type": "code",
92
  "execution_count": null,
93
  "metadata": {},
94
  "outputs": [],
95
- "source": [
96
- "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
97
- " tokenizer_mode=\"pretrained\", # \"train_new\" ๋˜๋Š” \"load_trained\"๋กœ ๋ณ€๊ฒฝ ๊ฐ€๋Šฅ\n",
98
- " config=data_config,\n",
99
- ")"
100
- ]
101
  },
102
  {
103
  "cell_type": "markdown",
@@ -166,4 +126,4 @@
166
  },
167
  "nbformat": 4,
168
  "nbformat_minor": 4
169
- }
 
3
  {
4
  "cell_type": "markdown",
5
  "metadata": {},
6
+ "source": "# 01. ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ\n\nํ† ํฌ๋‚˜์ด์ € ์ค€๋น„ โ†’ ๋ฐ์ดํ„ฐ ์ŠคํŠธ๋ฆฌ๋ฐ โ†’ ์‹œํ€€์Šค ํŒจํ‚น โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ\n\n**ํŒŒ์ดํ”„๋ผ์ธ ํ๋ฆ„:**\n```\nFineWeb-Edu (HuggingFace)\n โ†’ Streaming์œผ๋กœ ๋กœ๋“œ (๋””์Šคํฌ ์ €์žฅ ์—†์Œ)\n โ†’ ํ† ํฌ๋‚˜์ด์ง• (LLaMA 2 BPE, vocab=32K)\n โ†’ ์‹œํ€€์Šค ํŒจํ‚น (์—ฌ๋Ÿฌ ๋ฌธ์„œ๋ฅผ max_seq_len์œผ๋กœ ์—ฐ๊ฒฐ)\n โ†’ ๋ฐฐ์น˜ ๊ตฌ์„ฑ (input_ids, targets)\n โ†’ GPU ์ „์†ก\n```"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  },
8
  {
9
  "cell_type": "code",
 
45
  "execution_count": null,
46
  "metadata": {},
47
  "outputs": [],
48
+ "source": "data_config = DataConfig(\n dataset_name=\"HuggingFaceFW/fineweb-edu\",\n dataset_subset=\"sample-10BT\",\n max_seq_len=2048,\n batch_size=4,\n num_workers=2,\n)\n\nprint(f\"๋ฐ์ดํ„ฐ์…‹: {data_config.dataset_name} ({data_config.dataset_subset})\")\nprint(f\"ํ† ํฌ๋‚˜์ด์ €: {data_config.tokenizer_name} (vocab_size={data_config.vocab_size:,})\")\nprint(f\"์‹œํ€€์Šค ๊ธธ์ด: {data_config.max_seq_len}\")\nprint(f\"๋ฐฐ์น˜ ํฌ๊ธฐ: {data_config.batch_size}\")\nprint(f\"ํ† ํฐ/๋ฐฐ์น˜: {data_config.batch_size * data_config.max_seq_len:,}\")"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  },
50
  {
51
  "cell_type": "markdown",
52
  "metadata": {},
53
+ "source": "## 2. ํ† ํฌ๋‚˜์ด์ € + ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ ์„ค์ •\n\n**LLaMA 2 ํ† ํฌ๋‚˜์ด์ €** (`NousResearch/Llama-2-7b-hf`)๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.\n- vocab_size: 32,000\n- SentencePiece BPE โ€” 1B๊ธ‰ ๋ชจ๋ธ(TinyLlama, LLaMA 1/2)์˜ ํ‘œ์ค€ ํ† ํฌ๋‚˜์ด์ €\n- HuggingFace ์ธ์ฆ ์—†์ด ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ๊ณต์‹ ๋ฏธ๋Ÿฌ"
 
 
 
 
 
 
 
54
  },
55
  {
56
  "cell_type": "code",
57
  "execution_count": null,
58
  "metadata": {},
59
  "outputs": [],
60
+ "source": "tokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
 
 
 
 
 
61
  },
62
  {
63
  "cell_type": "markdown",
 
126
  },
127
  "nbformat": 4,
128
  "nbformat_minor": 4
129
+ }
notebooks/03_training.ipynb CHANGED
@@ -110,17 +110,7 @@
110
  "execution_count": null,
111
  "metadata": {},
112
  "outputs": [],
113
- "source": [
114
- "# ๋ชจ๋ธ ์ƒ์„ฑ\n",
115
- "model = LLMModel(model_config)\n",
116
- "print(f\"๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ: {model.count_parameters():,}\")\n",
117
- "\n",
118
- "# ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ\n",
119
- "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
120
- " tokenizer_mode=\"pretrained\",\n",
121
- " config=data_config,\n",
122
- ")"
123
- ]
124
  },
125
  {
126
  "cell_type": "markdown",
 
110
  "execution_count": null,
111
  "metadata": {},
112
  "outputs": [],
113
+ "source": "# ๋ชจ๋ธ ์ƒ์„ฑ\nmodel = LLMModel(model_config)\nprint(f\"๋ชจ๋ธ ํŒŒ๋ผ๋ฏธํ„ฐ: {model.count_parameters():,}\")\n\n# ๋ฐ์ดํ„ฐ ํŒŒ์ดํ”„๋ผ์ธ (GPT-2 ํ† ํฌ๋‚˜์ด์ € ์ž๋™ ์‚ฌ์šฉ)\ntokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
 
 
 
 
 
 
 
 
 
 
114
  },
115
  {
116
  "cell_type": "markdown",
notebooks/04_evaluation.ipynb CHANGED
@@ -85,14 +85,10 @@
85
  },
86
  {
87
  "cell_type": "code",
88
- "execution_count": 5,
89
  "metadata": {},
90
  "outputs": [],
91
- "source": [
92
- "from llm_lab.data import setup_data_pipeline\n",
93
- "\n",
94
- "tokenizer, train_dl, val_dl = setup_data_pipeline(tokenizer_mode=\"train_new\")"
95
- ]
96
  },
97
  {
98
  "cell_type": "code",
@@ -198,4 +194,4 @@
198
  },
199
  "nbformat": 4,
200
  "nbformat_minor": 4
201
- }
 
85
  },
86
  {
87
  "cell_type": "code",
88
+ "execution_count": null,
89
  "metadata": {},
90
  "outputs": [],
91
+ "source": "from llm_lab.data import setup_data_pipeline\n\ntokenizer, train_dl, val_dl = setup_data_pipeline()"
 
 
 
 
92
  },
93
  {
94
  "cell_type": "code",
 
194
  },
195
  "nbformat": 4,
196
  "nbformat_minor": 4
197
+ }
notebooks/05_debugging.ipynb CHANGED
@@ -101,17 +101,7 @@
101
  "execution_count": null,
102
  "metadata": {},
103
  "outputs": [],
104
- "source": [
105
- "# --- Model ---\n",
106
- "model = LLMModel(model_config).to(device)\n",
107
- "print(f\"Model parameters: {model.count_parameters():,}\")\n",
108
- "\n",
109
- "# --- Data pipeline ---\n",
110
- "tokenizer, train_dl, val_dl = setup_data_pipeline(\n",
111
- " tokenizer_mode=\"pretrained\",\n",
112
- " config=data_config,\n",
113
- ")"
114
- ]
115
  },
116
  {
117
  "cell_type": "markdown",
 
101
  "execution_count": null,
102
  "metadata": {},
103
  "outputs": [],
104
+ "source": "# --- Model ---\nmodel = LLMModel(model_config).to(device)\nprint(f\"Model parameters: {model.count_parameters():,}\")\n\n# --- Data pipeline (LLaMA 2 tokenizer ์ž๋™ ์‚ฌ์šฉ) ---\ntokenizer, train_dl, val_dl = setup_data_pipeline(config=data_config)"
 
 
 
 
 
 
 
 
 
 
105
  },
106
  {
107
  "cell_type": "markdown",