feat: add authenticated remote control UI and ngrok launcher

Browse files

Files changed (14) hide show

README.md +14 -4
data/bootstrap.py +105 -0
data/pipeline.py +93 -0
docs/COMMANDS.md +129 -18
scripts/run_data_pipeline.sh +1 -1
serve/control_plane.py +80 -3
serve/server.py +142 -17
serve/server_cpu.py +29 -1
serve/static/index.html +121 -1
tests/test_control_plane.py +13 -0
tests/test_data_pipeline.py +52 -0
tests/test_servers.py +44 -0
tests/test_tokenizer.py +20 -0
tokenizer/train_tokenizer.py +32 -13

README.md CHANGED Viewed

@@ -6,8 +6,16 @@ Designed to be both educational and functional, SAGE can be trained, fine-tuned,
 ---
-## ☁️ Cloud Quickstart (Kaggle / Colab)
-Running SAGE in the cloud? Check out the **[Kaggle & Colab Quickstart Guide](file:///c:/Users/Lenovo/OneDrive/Desktop/Documents/LLM_MOdel/SAGE_KAGGLE_GUIDE.md)** for one-click setup and a premium interactive chat interface.
 ---
@@ -92,11 +100,13 @@ Once launched, simply type your message to chat with SAGE. The system uses a rol
 The FastAPI server now serves a minimal remote control panel at `/`.
 ```bash
-export SAGE_WEB_PASSWORD=change-me
 python -m uvicorn serve.server:app --host 0.0.0.0 --port 8000
 ```
-Open the server root in a browser, log in with `SAGE_WEB_PASSWORD`, and use preset actions or a raw command box to drive the local repo from the UI. The included `test.ipynb` notebook starts the real app and exposes it through ngrok for Colab.
 ---

 ---
+## Command Guide
+Use the repo command sheet at `docs/COMMANDS.md` for the current end-to-end flow:
+- bootstrap starter JSONL data
+- train the tokenizer
+- build parquet shards
+- launch training
+- serve the model
+- use the browser UI and chat endpoint
 ---
 The FastAPI server now serves a minimal remote control panel at `/`.
 ```powershell
+$env:SAGE_WEB_PASSWORD="change-me"
+$env:SAGE_CHECKPOINT_DIR="runs/sage-1b"
+$env:SAGE_TOKENIZER_MODEL="tokenizer/tokenizer.model"
 python -m uvicorn serve.server:app --host 0.0.0.0 --port 8000
 ```
+Open the server root in a browser, log in with `SAGE_WEB_PASSWORD`, and use preset actions or a raw command box to drive the local repo from the UI. The browser UI now also includes direct text chat through `/chat` when a tokenizer is available. If no checkpoint is loaded yet, the UI warns that outputs are coming from randomly initialized weights. The included `test.ipynb` notebook starts the real app and exposes it through ngrok for Colab.
 ---

data/bootstrap.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""Bootstrap small raw corpora for tokenizer and smoke-training flows."""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+BOOTSTRAP_CORPORA: dict[str, list[str]] = {
+    "general_web": [
+        "Large language models learn by predicting the next token in a sequence, but useful systems depend just as much on data quality as on architecture size.",
+        "A good training corpus mixes clean prose, documentation, dialogue, and reference material so the model sees multiple ways humans structure information.",
+        "When you build a local model, start with small smoke runs, measure loss curves, and only then scale sequence length, batch size, and parameter count.",
+        "The fastest way to waste compute is to train on noisy duplicated text without checking tokenization, filtering, and validation splits first.",
+        "Evaluation should include both regression tests and qualitative prompts because perplexity alone does not tell you whether a model follows instructions well.",
+        "A serving stack usually needs checkpoint loading, tokenization, generation settings, and telemetry before it is practical for iterative experiments.",
+    ],
+    "code": [
+        "def running_mean(values):\n    total = 0.0\n    result = []\n    for index, value in enumerate(values, start=1):\n        total += value\n        result.append(total / index)\n    return result",
+        "class TextBatch:\n    def __init__(self, items):\n        self.items = list(items)\n\n    def join(self, sep='\\n'):\n        return sep.join(self.items)",
+        "from pathlib import Path\n\ndef read_text(path):\n    return Path(path).read_text(encoding='utf-8')",
+        "def clamp(value, lo, hi):\n    if value < lo:\n        return lo\n    if value > hi:\n        return hi\n    return value",
+        "def format_metrics(step, loss):\n    return f'step={step} loss={loss:.4f}'",
+        "def greedy_decode(logits):\n    import torch\n    return int(torch.argmax(logits, dim=-1).item())",
+    ],
+    "math_science": [
+        "The derivative of x squared is 2x, and gradient-based optimization uses derivatives to decide how to update model parameters.",
+        "Perplexity is the exponential of average negative log likelihood; lower perplexity means the model assigns higher probability to the observed sequence.",
+        "If a batch contains B sequences of length T, then the number of next-token predictions is roughly B times T.",
+        "Matrix multiplication is central to transformer inference because projections for queries, keys, values, and feed-forward layers are all linear maps.",
+        "Softmax converts raw logits into a probability distribution by exponentiating each value and dividing by the sum of exponentials.",
+        "The context window bounds how many previous tokens the decoder can attend to while producing the next token.",
+    ],
+    "multilingual": [
+        "English: Training data should be filtered, deduplicated, and documented before long runs begin.",
+        "Hindi: अच्छे मॉडल के लिए साफ और विविध डेटा उतना ही जरूरी है जितना अच्छा आर्किटेक्चर।",
+        "Arabic: جودة البيانات تؤثر على جودة النموذج بقدر تأثير حجم النموذج نفسه.",
+        "Chinese: 在开始长时间训练之前，先做小规模验证可以节省大量计算资源。",
+        "Spanish: Un buen flujo de datos incluye limpieza, deduplicacion y particiones reproducibles.",
+        "French: Un modele utile demande des donnees propres, des tests et une boucle d'evaluation simple.",
+    ],
+    "synthetic": [
+        "[INST] Explain why deduplication matters before tokenizer training. [/INST] Deduplication prevents repeated passages from dominating merge statistics and reduces wasted compute during later model training.",
+        "[INST] Write a short checklist for a smoke training run. [/INST] Verify shards exist, verify tokenizer loads, run a short job, inspect metrics, and confirm checkpoints are written.",
+        "[INST] How do you know a dataset is too noisy? [/INST] Look for low alpha ratios, malformed markup, repeated content, excessive boilerplate, or corrupted encoding.",
+        "[INST] What is the purpose of a validation split? [/INST] It gives you held-out data for tracking generalization and for catching regressions during training.",
+        "[INST] Summarize the role of the tokenizer. [/INST] The tokenizer maps raw text into stable token ids the model can consume during training and generation.",
+        "[INST] Why keep metadata with each record? [/INST] Metadata helps audit provenance, quality, language mix, and filtering decisions across the pipeline.",
+    ],
+}
+def _pad_sample(text: str, minimum_chars: int = 240) -> str:
+    """Return ``text`` stripped and padded to at least ``minimum_chars`` characters.
+    A fixed filler sentence is appended as many whole times as needed; text that is
+    already long enough is returned stripped but otherwise unchanged.
+    """
+    filler = (
+        " This bootstrap record is intentionally longer so the repo's default "
+        "quality filters keep it during smoke-test data preparation and tokenizer training."
+    )
+    body = text.strip()
+    shortfall = minimum_chars - len(body)
+    if shortfall <= 0:
+        return body
+    # Ceiling division: the smallest number of whole fillers that covers the shortfall.
+    repeats = -(-shortfall // len(filler))
+    return body + filler * repeats
+def bootstrap_raw_corpora(output_dir: str = "data/raw", overwrite: bool = False) -> dict[str, int]:
+    """Write one small JSONL corpus file per registered source.
+    Args:
+        output_dir: Directory that receives one ``<source_name>.jsonl`` file per
+            entry of ``BOOTSTRAP_CORPORA``; it is created when missing.
+        overwrite: When false, a file that already exists is left untouched and
+            only its current line count is reported.
+    Returns:
+        Mapping of source name to the number of records in its file.
+    """
+    root = Path(output_dir)
+    root.mkdir(parents=True, exist_ok=True)
+    counts: dict[str, int] = {}
+    for source_name, samples in BOOTSTRAP_CORPORA.items():
+        path = root / f"{source_name}.jsonl"
+        if path.exists() and not overwrite:
+            # Count lines inside a context manager so the handle is closed
+            # deterministically instead of being left to the garbage collector.
+            with path.open("r", encoding="utf-8") as handle:
+                counts[source_name] = sum(1 for _ in handle)
+            continue
+        with path.open("w", encoding="utf-8") as handle:
+            for index, text in enumerate(samples, start=1):
+                payload = {
+                    "id": f"{source_name}-{index:04d}",
+                    "text": _pad_sample(text),
+                    "source_name": source_name,
+                }
+                # ensure_ascii=False keeps the multilingual samples readable in the file.
+                handle.write(json.dumps(payload, ensure_ascii=False) + "\n")
+        counts[source_name] = len(samples)
+    return counts
+def build_argparser() -> argparse.ArgumentParser:
+    """Create the argument parser used by the bootstrap CLI."""
+    cli = argparse.ArgumentParser(description="Create small JSONL corpora for SAGE smoke runs.")
+    options = (
+        ("--output-dir", {"default": "data/raw", "help": "Directory for raw JSONL corpus files."}),
+        ("--overwrite", {"action": "store_true", "help": "Replace any existing bootstrap corpus files."}),
+    )
+    for flag, settings in options:
+        cli.add_argument(flag, **settings)
+    return cli
+def main() -> None:
+    """Run the bootstrap CLI and print a JSON summary of the per-source record counts."""
+    options = build_argparser().parse_args()
+    per_source = bootstrap_raw_corpora(output_dir=options.output_dir, overwrite=options.overwrite)
+    report = {"output_dir": options.output_dir, "sources": per_source}
+    print(json.dumps(report, indent=2, ensure_ascii=False))
+if __name__ == "__main__":
+    main()

data/pipeline.py ADDED Viewed

	@@ -0,0 +1,93 @@

+"""End-to-end raw-corpus to Parquet shard pipeline."""
+from __future__ import annotations
+import argparse
+import json
+from pathlib import Path
+import sentencepiece as spm
+from data.dedup import deduplicate_records
+from data.filter import FilterConfig, filter_record
+from data.ingest import SOURCE_REGISTRY, stream_source
+from data.shard import ShardConfig, write_shards
+def _select_sources(names: list[str] | None) -> tuple:
+    """Return the registry entries whose names were requested.
+    An empty or missing ``names`` selects the whole ``SOURCE_REGISTRY``. Matching
+    entries come back in registry order; any requested name without a registry
+    entry raises ``ValueError`` listing the unknown names in sorted order.
+    """
+    if names:
+        requested = set(names)
+        chosen = tuple(entry for entry in SOURCE_REGISTRY if entry.name in requested)
+        unknown = sorted(requested.difference(entry.name for entry in chosen))
+        if unknown:
+            raise ValueError(f"Unknown sources: {', '.join(unknown)}")
+        return chosen
+    return SOURCE_REGISTRY
+def build_records(source_names: list[str] | None = None, limit_per_source: int | None = None) -> list[dict[str, object]]:
+    """Collect filtered records from each selected source, then deduplicate them.
+    ``limit_per_source`` caps how many records that pass the filter are kept per
+    source; the cap is checked after each kept record. ``None`` means no cap.
+    """
+    collected: list[dict[str, object]] = []
+    capped = limit_per_source is not None
+    for spec in _select_sources(source_names):
+        kept: list[dict[str, object]] = []
+        for raw in stream_source(spec):
+            cleaned = filter_record(raw, FilterConfig())
+            if cleaned is not None:
+                kept.append(cleaned)
+                # Stop reading this source as soon as the cap is reached.
+                if capped and len(kept) >= limit_per_source:
+                    break
+        collected += kept
+    return deduplicate_records(collected)
+def run_pipeline(
+    tokenizer_model: str,
+    output_dir: str = "data/processed",
+    source_names: list[str] | None = None,
+    shard_size: int = 2048,
+    limit_per_source: int | None = None,
+) -> dict[str, object]:
+    """Turn the raw corpora into Parquet shards and record a run summary.
+    The summary is returned and also written to ``pipeline_summary.json`` inside
+    ``output_dir``.
+    """
+    processor = spm.SentencePieceProcessor()
+    processor.load(tokenizer_model)
+    records = build_records(source_names=source_names, limit_per_source=limit_per_source)
+    shard_config = ShardConfig(output_dir=output_dir, shard_size=shard_size)
+    manifest = write_shards(records, processor, shard_config)
+    # With no explicit selection, report every registered source name.
+    if source_names:
+        reported_sources = source_names
+    else:
+        reported_sources = [spec.name for spec in SOURCE_REGISTRY]
+    summary = {
+        "tokenizer_model": tokenizer_model,
+        "output_dir": output_dir,
+        "records": len(records),
+        "sources": reported_sources,
+        "manifest": manifest,
+    }
+    destination = Path(output_dir)
+    destination.mkdir(parents=True, exist_ok=True)
+    summary_path = destination / "pipeline_summary.json"
+    summary_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
+    return summary
+def build_argparser() -> argparse.ArgumentParser:
+    """Create the argument parser used by the data-pipeline CLI."""
+    cli = argparse.ArgumentParser(description="Filter, deduplicate, tokenize, and shard SAGE raw corpora.")
+    options = (
+        ("--tokenizer-model", {"default": "tokenizer/tokenizer.model", "help": "SentencePiece tokenizer model."}),
+        ("--output-dir", {"default": "data/processed", "help": "Destination directory for parquet shards."}),
+        ("--sources", {"nargs": "*", "default": None, "help": "Subset of source names from data.ingest.SOURCE_REGISTRY."}),
+        ("--shard-size", {"type": int, "default": 2048, "help": "Rows per parquet shard."}),
+        ("--limit-per-source", {"type": int, "default": None, "help": "Optional cap for smoke-testing."}),
+    )
+    for flag, settings in options:
+        cli.add_argument(flag, **settings)
+    return cli
+def main() -> None:
+    """Parse CLI options, run the pipeline, and print its summary as JSON."""
+    options = build_argparser().parse_args()
+    pipeline_kwargs = {
+        "tokenizer_model": options.tokenizer_model,
+        "output_dir": options.output_dir,
+        "source_names": options.sources,
+        "shard_size": options.shard_size,
+        "limit_per_source": options.limit_per_source,
+    }
+    print(json.dumps(run_pipeline(**pipeline_kwargs), indent=2))
+if __name__ == "__main__":
+    main()

docs/COMMANDS.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # SAGE Commands
-This file is the short command-only reference for the repo.
 ## Install
@@ -14,25 +14,81 @@ pip install -r requirements.txt
 pytest -q
 ```
-## Train tokenizer
 ```bash
 python -m tokenizer.train_tokenizer \
-  --input data/raw/general_web.txt data/raw/code.txt \
-  --model-prefix tokenizer/tokenizer \
   --model-prefix tokenizer/tokenizer \
 ```
-## Validate tokenizer
 ```bash
-bash scripts/run_validate_tokenizer.sh tokenizer/tokenizer.model
 ```
-## Start a short training smoke run
 ```bash
 python -m train.trainer \
   --train-shards data/processed/shard-00000.parquet \
   --validation-shards data/processed/shard-00001.parquet \
   --output-dir runs/smoke \
@@ -40,7 +96,7 @@ python -m train.trainer \
   --disable-wandb
 ```
-## Start full training
 ```bash
 python -m train.trainer \
@@ -51,38 +107,93 @@ python -m train.trainer \
   --output-dir runs/sage-1b
 ```
-## Run eval harness
 ```bash
-bash scripts/run_eval.sh
 ```
-## Start GPU server
 ```bash
-export SAGE_WEB_PASSWORD=change-me
-bash scripts/run_serve.sh
 ```
-## Start CPU server
 ```bash
-export SAGE_WEB_PASSWORD=change-me
 bash scripts/run_serve_cpu.sh
 ```
-The server root now hosts the browser control panel at `/`. Log in with `SAGE_WEB_PASSWORD`, then use presets or raw commands from the UI.
-## Check server health
 ```bash
 curl http://127.0.0.1:8000/health
 ```
-## Generate tokens from the API
 ```bash
 curl -X POST http://127.0.0.1:8000/generate \
   -H "Content-Type: application/json" \
   -d "{\"input_ids\": [1, 42, 99], \"max_new_tokens\": 8}"
 ```

 # SAGE Commands
+This is the repo's current command reference for data preparation, tokenizer training, model training, serving, browser control, and validation.
 ## Install
 pytest -q
 ```
+## 1. Create a starter dataset
+This repo does not ship a large training corpus. The fastest way to unblock the pipeline is to generate the built-in smoke dataset first:
+```bash
+python -m data.bootstrap --output-dir data/raw --overwrite
+```
+That writes JSONL files like:
+```text
+data/raw/general_web.jsonl
+data/raw/code.jsonl
+data/raw/math_science.jsonl
+data/raw/multilingual.jsonl
+data/raw/synthetic.jsonl
+```
+If you want to use your own corpus, put JSONL records in the same folder with at least a `text` field:
+```json
+{"text": "your training sample here"}
+```
+## 2. Train the tokenizer
+The tokenizer trainer now accepts plain text files or JSONL files.
 ```bash
 python -m tokenizer.train_tokenizer \
+  --input data/raw/general_web.jsonl data/raw/code.jsonl data/raw/math_science.jsonl data/raw/multilingual.jsonl data/raw/synthetic.jsonl \
   --model-prefix tokenizer/tokenizer \
+  --vocab-size 4096 \
+  --training-text tokenizer/training_corpus.txt
+```
+## 3. Validate the tokenizer
+```bash
+python -m tokenizer.validate_tokenizer tokenizer/tokenizer.model
+```
+## 4. Build parquet shards
+```bash
+python -m data.pipeline \
+  --tokenizer-model tokenizer/tokenizer.model \
+  --output-dir data/processed \
+  --shard-size 128
 ```
+For a short smoke run:
 ```bash
+python -m data.pipeline \
+  --tokenizer-model tokenizer/tokenizer.model \
+  --output-dir data/processed \
+  --shard-size 32 \
+  --limit-per-source 4
 ```
+The shell helper now points to the real data pipeline:
+```bash
+bash scripts/run_data_pipeline.sh --tokenizer-model tokenizer/tokenizer.model --output-dir data/processed
+```
+## 5. Start training
+Smoke run:
 ```bash
 python -m train.trainer \
+  --model-config configs/model/1b.yaml \
+  --schedule-config configs/train/schedule.yaml \
   --train-shards data/processed/shard-00000.parquet \
   --validation-shards data/processed/shard-00001.parquet \
   --output-dir runs/smoke \
   --disable-wandb
 ```
+Longer run:
 ```bash
 python -m train.trainer \
   --output-dir runs/sage-1b
 ```
+## 6. Serve the model
+GPU/PyTorch server:
 ```powershell
+$env:SAGE_WEB_PASSWORD="change-me"
+$env:SAGE_CHECKPOINT_DIR="runs/sage-1b"
+$env:SAGE_TOKENIZER_MODEL="tokenizer/tokenizer.model"
+python -m uvicorn serve.server:app --host 0.0.0.0 --port 8000
 ```
+CPU control-plane server:
 ```powershell
+$env:SAGE_WEB_PASSWORD="change-me"
+python -m uvicorn serve.server_cpu:app --host 0.0.0.0 --port 8001
 ```
+Helper scripts:
 ```bash
+bash scripts/run_serve.sh
 bash scripts/run_serve_cpu.sh
 ```
+## 7. Browser control panel
+Open the server root:
+```text
+http://127.0.0.1:8000/
+```
+The browser UI now supports:
+- login with `SAGE_WEB_PASSWORD`
+- dataset bootstrap preset
+- shard-building preset
+- tokenizer/train/eval/server presets
+- raw shell commands
+- live job logs
+- direct model chat through `/chat`
+## 8. API commands
+Health:
 ```bash
 curl http://127.0.0.1:8000/health
 ```
+Generate from token ids:
 ```bash
 curl -X POST http://127.0.0.1:8000/generate \
   -H "Content-Type: application/json" \
   -d "{\"input_ids\": [1, 42, 99], \"max_new_tokens\": 8}"
 ```
+Chat from text:
+```bash
+curl -X POST http://127.0.0.1:8000/chat \
+  -H "Content-Type: application/json" \
+  -d "{\"prompt\": \"Explain the training flow in this repo.\", \"max_new_tokens\": 64}"
+```
+Chat status:
+```bash
+curl http://127.0.0.1:8000/chat/status
+```
+## 9. Evaluation
+```bash
+python -m eval.run_benchmarks
+```
+Or use the helper:
+```bash
+bash scripts/run_eval.sh
+```
+## 10. Hugging Face sync
+```bash
+python hf_push.py
+```

scripts/run_data_pipeline.sh CHANGED Viewed

@@ -1,4 +1,4 @@
 #!/usr/bin/env bash
 set -euo pipefail
-python -m tokenizer.train_tokenizer "$@"

 #!/usr/bin/env bash
 set -euo pipefail
+python -m data.pipeline "$@"

serve/control_plane.py CHANGED Viewed

@@ -158,6 +158,34 @@ def _build_presets(enable_generate: bool) -> list[CommandPreset]:
             "Call the local /health API and show the JSON response.",
             "api",
         ),
         CommandPreset(
             "serve_gpu",
             "Serve GPU",
@@ -188,7 +216,7 @@ def _build_presets(enable_generate: bool) -> list[CommandPreset]:
                     "input_paths",
                     "Input Paths",
                     kind="textarea",
-                    placeholder="data/raw/general_web.txt\ndata/raw/code.txt",
                     required=True,
                 ),
                 PresetField("model_prefix", "Model Prefix", default="tokenizer/tokenizer"),
@@ -472,7 +500,49 @@ def _api_response(handler: Callable[[dict[str, Any]], dict[str, Any]], args: dic
     return {"kind": "api", "result": handler(args)}
 def _build_command_for_preset(preset_id: str, args: dict[str, Any]) -> list[str] | str:
     if preset_id == "serve_gpu":
         return [
             sys.executable,
@@ -607,6 +677,7 @@ def build_control_router(api_handlers: dict[str, Callable[[dict[str, Any]], dict
             preset = preset_map.get(payload.preset_id)
             if preset is None:
                 raise HTTPException(status_code=404, detail=f"Unknown preset: {payload.preset_id}")
             if preset.mode == "api":
                 handler = api_handlers.get(payload.preset_id)
                 if handler is None:
@@ -614,11 +685,17 @@ def build_control_router(api_handlers: dict[str, Callable[[dict[str, Any]], dict
                 return _api_response(handler, payload.args)
             command = _build_command_for_preset(payload.preset_id, payload.args)
             mode = "shell" if isinstance(command, str) else "job"
-            job = CONTROL_MANAGER.start_job(preset.label, command, cwd=str(REPO_ROOT), mode=mode)
             return {"kind": "job", "job": job.to_dict()}
         if payload.command:
             cwd = payload.cwd or str(REPO_ROOT)
-            job = CONTROL_MANAGER.start_job("Raw Command", payload.command, cwd=cwd, mode="shell")
             return {"kind": "job", "job": job.to_dict()}
         raise HTTPException(status_code=400, detail="Provide either preset_id or command.")

             "Call the local /health API and show the JSON response.",
             "api",
         ),
+        CommandPreset(
+            "data_bootstrap",
+            "Bootstrap Dataset",
+            "Create small JSONL corpora under data/raw for tokenizer and smoke-training runs.",
+            "job",
+            (
+                PresetField("output_dir", "Output Dir", default="data/raw"),
+                PresetField("overwrite", "Overwrite Existing Files", kind="boolean", default=False),
+            ),
+        ),
+        CommandPreset(
+            "data_pipeline",
+            "Build Data Shards",
+            "Filter raw JSONL corpora, deduplicate them, then write parquet shards with the trained tokenizer.",
+            "job",
+            (
+                PresetField("tokenizer_model", "Tokenizer Model", default="tokenizer/tokenizer.model"),
+                PresetField("output_dir", "Output Dir", default="data/processed"),
+                PresetField(
+                    "sources",
+                    "Sources",
+                    kind="textarea",
+                    placeholder="general_web\ncode\nmath_science\nmultilingual\nsynthetic",
+                ),
+                PresetField("shard_size", "Shard Size", kind="number", default=2048),
+                PresetField("limit_per_source", "Limit Per Source", kind="number", default=0),
+            ),
+        ),
         CommandPreset(
             "serve_gpu",
             "Serve GPU",
                     "input_paths",
                     "Input Paths",
                     kind="textarea",
+                    placeholder="data/raw/general_web.jsonl\ndata/raw/code.jsonl",
                     required=True,
                 ),
                 PresetField("model_prefix", "Model Prefix", default="tokenizer/tokenizer"),
     return {"kind": "api", "result": handler(args)}
+def _validate_preset_args(preset: CommandPreset, args: dict[str, Any]) -> None:
+    """Reject a preset invocation that leaves any required field empty.
+    A required field counts as empty when its argument is absent or ``None``, a
+    whitespace-only string, or an empty list. All offending field labels are
+    reported together in a single HTTP 400 error.
+    """
+    def _is_empty(value: Any) -> bool:
+        if value is None:
+            return True
+        if isinstance(value, str):
+            return not value.strip()
+        if isinstance(value, list):
+            return not value
+        return False
+    missing = [
+        field.label
+        for field in preset.fields
+        if field.required and _is_empty(args.get(field.name))
+    ]
+    if missing:
+        raise HTTPException(status_code=400, detail=f"Missing required fields: {', '.join(missing)}")
 def _build_command_for_preset(preset_id: str, args: dict[str, Any]) -> list[str] | str:
+    if preset_id == "data_bootstrap":
+        command = [sys.executable, "-m", "data.bootstrap", "--output-dir", str(args.get("output_dir") or "data/raw")]
+        if bool(args.get("overwrite", False)):
+            command.append("--overwrite")
+        return command
+    if preset_id == "data_pipeline":
+        command = [
+            sys.executable,
+            "-m",
+            "data.pipeline",
+            "--tokenizer-model",
+            str(args.get("tokenizer_model") or "tokenizer/tokenizer.model"),
+            "--output-dir",
+            str(args.get("output_dir") or "data/processed"),
+            "--shard-size",
+            str(_parse_number(args.get("shard_size"), 2048)),
+        ]
+        sources = _split_multi_value(args.get("sources"))
+        if sources:
+            command.extend(["--sources", *sources])
+        limit_per_source = _parse_number(args.get("limit_per_source"), 0)
+        if limit_per_source > 0:
+            command.extend(["--limit-per-source", str(limit_per_source)])
+        return command
     if preset_id == "serve_gpu":
         return [
             sys.executable,
             preset = preset_map.get(payload.preset_id)
             if preset is None:
                 raise HTTPException(status_code=404, detail=f"Unknown preset: {payload.preset_id}")
+            _validate_preset_args(preset, payload.args)
             if preset.mode == "api":
                 handler = api_handlers.get(payload.preset_id)
                 if handler is None:
                 return _api_response(handler, payload.args)
             command = _build_command_for_preset(payload.preset_id, payload.args)
             mode = "shell" if isinstance(command, str) else "job"
+            try:
+                job = CONTROL_MANAGER.start_job(preset.label, command, cwd=str(REPO_ROOT), mode=mode)
+            except OSError as exc:
+                raise HTTPException(status_code=400, detail=str(exc)) from exc
             return {"kind": "job", "job": job.to_dict()}
         if payload.command:
             cwd = payload.cwd or str(REPO_ROOT)
+            try:
+                job = CONTROL_MANAGER.start_job("Raw Command", payload.command, cwd=cwd, mode="shell")
+            except OSError as exc:
+                raise HTTPException(status_code=400, detail=str(exc)) from exc
             return {"kind": "job", "job": job.to_dict()}
         raise HTTPException(status_code=400, detail="Provide either preset_id or command.")

serve/server.py CHANGED Viewed

@@ -2,7 +2,9 @@
 from __future__ import annotations
-from typing import Optional
 import torch
 from fastapi import FastAPI
@@ -11,13 +13,21 @@ from pydantic import BaseModel
 from model.config import ModelConfig
 from model.model import SageTransformer
 from serve.control_plane import build_control_router
-from serve.kv_cache import KVCache
 from train.hardware import HardwareConfig
 app = FastAPI(title="SAGE Server")
 _MODEL: SageTransformer | None = None
 _TOKENIZER = None
 class GenerationRequest(BaseModel):
@@ -27,37 +37,152 @@ class GenerationRequest(BaseModel):
     max_new_tokens: int = 32
 def get_model() -> SageTransformer:
     """Lazily create the model for server startup."""
-    global _MODEL
     if _MODEL is None:
-        _MODEL = SageTransformer(ModelConfig())
         _MODEL.eval()
     return _MODEL
 @app.get("/health")
 def health() -> dict[str, object]:
     """Return basic health and hardware information."""
     hw = HardwareConfig(model_size_b=1.0, context_length=4096)
-    return {"status": "ok", "hardware": hw.summary()}
 @app.post("/generate")
 def generate(request: GenerationRequest) -> dict[str, object]:
     """Generate continuation token ids from an input token list."""
-    model = get_model()
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    model.to(device)
-    input_ids = torch.tensor([request.input_ids], dtype=torch.long, device=device)
-    generated = list(request.input_ids)
-    cache: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None
-    for _ in range(request.max_new_tokens):
-        logits, cache = model(input_ids[:, -1:] if cache is not None else input_ids, past_key_values=cache)
-        next_token = int(torch.argmax(logits[:, -1, :], dim=-1).item())
-        generated.append(next_token)
-        input_ids = torch.tensor([[next_token]], dtype=torch.long, device=device)
-    return {"tokens": generated}
 def _health_action(_: dict[str, object]) -> dict[str, object]:

 from __future__ import annotations
+import os
+from pathlib import Path
+from typing import Any, Optional
 import torch
 from fastapi import FastAPI
 from model.config import ModelConfig
 from model.model import SageTransformer
 from serve.control_plane import build_control_router
+from train.checkpoint import load_latest_checkpoint
 from train.hardware import HardwareConfig
 app = FastAPI(title="SAGE Server")
 _MODEL: SageTransformer | None = None
 _TOKENIZER = None
+_MODEL_DEVICE: torch.device | None = None
+_MODEL_STATE: dict[str, Any] = {
+    "model_config": None,
+    "checkpoint_dir": None,
+    "checkpoint_loaded": False,
+    "checkpoint_step": 0,
+    "tokenizer_path": None,
+}
 class GenerationRequest(BaseModel):
     max_new_tokens: int = 32
+class ChatRequest(BaseModel):
+    """Request schema for text generation through the tokenizer."""
+    # Raw prompt text; the /chat handler strips it and rejects it when nothing is left.
+    prompt: str
+    # Number of tokens to generate after the prompt (defaults to 64).
+    max_new_tokens: int = 64
+def get_generation_device() -> torch.device:
+    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
+    if torch.cuda.is_available():
+        return torch.device("cuda")
+    return torch.device("cpu")
+def _resolve_model_config_path() -> Path:
+    """Return the path named by ``SAGE_MODEL_CONFIG``, or the default config path when that file is absent."""
+    candidate = Path(os.environ.get("SAGE_MODEL_CONFIG", "configs/model/1b.yaml"))
+    if candidate.exists():
+        return candidate
+    # Fall back to the repo default; callers still need to check that it exists.
+    return Path("configs/model/1b.yaml")
+def _resolve_checkpoint_dir() -> Path:
+    """Return the checkpoint directory named by ``SAGE_CHECKPOINT_DIR`` (default ``runs/default``)."""
+    location = os.environ.get("SAGE_CHECKPOINT_DIR", "runs/default")
+    return Path(location)
+def _resolve_tokenizer_path() -> Path:
+    """Return the tokenizer model path named by ``SAGE_TOKENIZER_MODEL`` (default ``tokenizer/tokenizer.model``)."""
+    location = os.environ.get("SAGE_TOKENIZER_MODEL", "tokenizer/tokenizer.model")
+    return Path(location)
 def get_model() -> SageTransformer:
     """Lazily create the model for server startup."""
+    global _MODEL, _MODEL_DEVICE
     if _MODEL is None:
+        # First call only: build the model from the YAML config when that file
+        # exists, otherwise from the ModelConfig defaults.
+        config_path = _resolve_model_config_path()
+        config = ModelConfig.from_yaml(config_path) if config_path.exists() else ModelConfig()
+        _MODEL = SageTransformer(config)
+        checkpoint_dir = _resolve_checkpoint_dir()
+        # Stays 0 when the checkpoint directory is missing, which marks the
+        # model as "no checkpoint loaded" in _MODEL_STATE below.
+        checkpoint_step = 0
+        if checkpoint_dir.exists():
+            # NOTE(review): assumes load_latest_checkpoint returns the restored
+            # step and 0 when nothing was loaded — confirm against train.checkpoint.
+            checkpoint_step = load_latest_checkpoint(_MODEL, None, None, None, str(checkpoint_dir), device="cpu")
+        # Record what was resolved so chat_status() can report it.
+        _MODEL_STATE.update(
+            {
+                "model_config": str(config_path),
+                "checkpoint_dir": str(checkpoint_dir),
+                "checkpoint_loaded": checkpoint_step > 0,
+                "checkpoint_step": checkpoint_step,
+            }
+        )
         _MODEL.eval()
+    # Runs on every call: move the cached model only when the target device
+    # differs from the one it was last moved to.
+    device = get_generation_device()
+    if _MODEL_DEVICE != device:
+        _MODEL = _MODEL.to(device)
+        _MODEL_DEVICE = device
     return _MODEL
+def get_tokenizer():
+    """Return the cached tokenizer processor, loading it on first use.
+    The resolved path is recorded in ``_MODEL_STATE["tokenizer_path"]`` on every load
+    attempt. Returns ``None`` when the tokenizer file does not exist.
+    """
+    global _TOKENIZER
+    if _TOKENIZER is not None:
+        return _TOKENIZER
+    model_file = _resolve_tokenizer_path()
+    _MODEL_STATE["tokenizer_path"] = str(model_file)
+    if model_file.exists():
+        # Deferred import: only executed when a tokenizer file is actually present.
+        from tokenizer.validate_tokenizer import load_processor
+        _TOKENIZER = load_processor(str(model_file))
+    return _TOKENIZER
+def _generate_token_ids(input_ids: list[int], max_new_tokens: int) -> list[int]:
+    """Greedily extend ``input_ids`` by up to ``max_new_tokens`` tokens.
+    The first forward pass receives the whole prompt; later passes receive only the
+    newest token together with the cache returned by the previous pass. A negative
+    ``max_new_tokens`` is treated as zero. Returns the prompt ids followed by the
+    newly chosen ids.
+    """
+    model = get_model()
+    device = get_generation_device()
+    sequence = list(input_ids)
+    step_input = torch.tensor([input_ids], dtype=torch.long, device=device)
+    past: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None
+    with torch.inference_mode():
+        for _ in range(max(0, int(max_new_tokens))):
+            if past is None:
+                model_input = step_input
+            else:
+                model_input = step_input[:, -1:]
+            logits, past = model(model_input, past_key_values=past)
+            # Greedy decoding: take the highest-scoring token at the last position.
+            chosen = int(torch.argmax(logits[:, -1, :], dim=-1).item())
+            sequence.append(chosen)
+            step_input = torch.tensor([[chosen]], dtype=torch.long, device=device)
+    return sequence
+def chat_status() -> dict[str, object]:
+    """Summarize tokenizer and checkpoint readiness for the chat endpoints.
+    ``warning`` is ``None`` when both are ready; a missing tokenizer takes
+    precedence over a missing checkpoint.
+    """
+    tokenizer = get_tokenizer()
+    has_checkpoint = bool(_MODEL_STATE["checkpoint_loaded"])
+    if tokenizer is None:
+        warning = "Tokenizer model not found. Train or place tokenizer/tokenizer.model before using browser chat."
+    elif has_checkpoint:
+        warning = None
+    else:
+        warning = "No checkpoint loaded. Chat will run with randomly initialized model weights until you train or configure SAGE_CHECKPOINT_DIR."
+    return {
+        "available": tokenizer is not None,
+        "tokenizer_path": _MODEL_STATE["tokenizer_path"],
+        "checkpoint_dir": _MODEL_STATE["checkpoint_dir"],
+        "checkpoint_loaded": has_checkpoint,
+        "checkpoint_step": _MODEL_STATE["checkpoint_step"],
+        "warning": warning,
+    }
 @app.get("/health")
 def health() -> dict[str, object]:
     """Return basic health and hardware information."""
     hw = HardwareConfig(model_size_b=1.0, context_length=4096)
+    # "chat" embeds the chat_status() payload so one call also reports tokenizer/checkpoint readiness.
+    return {"status": "ok", "hardware": hw.summary(), "chat": chat_status()}
 @app.post("/generate")
 def generate(request: GenerationRequest) -> dict[str, object]:
     """Generate continuation token ids from an input token list."""
+    # "tokens" holds the request's input ids followed by the newly generated ids.
+    return {"tokens": _generate_token_ids(request.input_ids, request.max_new_tokens)}
+@app.get("/chat/status")
+def get_chat_status() -> dict[str, object]:
+    """Serve the current ``chat_status()`` payload over HTTP."""
+    status = chat_status()
+    return status
+@app.post("/chat")
+def chat(request: ChatRequest) -> dict[str, object]:
+    """Encode the prompt, generate a greedy continuation, and decode only the new tokens.
+    A missing tokenizer or a blank prompt is reported in the response body with
+    ``success`` set to ``False``. Every response also carries the ``chat_status()`` fields.
+    """
+    tokenizer = get_tokenizer()
+    if tokenizer is None:
+        failure = {
+            "success": False,
+            "detail": "Tokenizer model not found. Train the tokenizer first or set SAGE_TOKENIZER_MODEL.",
+        }
+        return {**failure, **chat_status()}
+    prompt = request.prompt.strip()
+    if not prompt:
+        failure = {"success": False, "detail": "Prompt cannot be empty."}
+        return {**failure, **chat_status()}
+    prompt_ids = list(tokenizer.encode(prompt, out_type=int))
+    all_ids = _generate_token_ids(prompt_ids, request.max_new_tokens)
+    # Everything after the prompt is the model's continuation.
+    new_ids = all_ids[len(prompt_ids) :]
+    result = {
+        "success": True,
+        "prompt": prompt,
+        "response": tokenizer.decode(new_ids),
+        "input_ids": prompt_ids,
+        "output_ids": all_ids,
+        "new_token_ids": new_ids,
+    }
+    return {**result, **chat_status()}
 def _health_action(_: dict[str, object]) -> dict[str, object]:

serve/server_cpu.py CHANGED Viewed

@@ -5,6 +5,7 @@ from __future__ import annotations
 import shutil
 from fastapi import FastAPI
 from serve.control_plane import build_control_router
@@ -12,10 +13,37 @@ from serve.control_plane import build_control_router
 app = FastAPI(title="SAGE CPU Server")
 @app.get("/health")
 def health() -> dict[str, object]:
     """Report llama.cpp availability for CPU serving."""
-    return {"status": "ok", "llama_cpp_available": shutil.which("llama-server") is not None}
 def _health_action(_: dict[str, object]) -> dict[str, object]:

 import shutil
 from fastapi import FastAPI
+from pydantic import BaseModel
 from serve.control_plane import build_control_router
 app = FastAPI(title="SAGE CPU Server")
+class ChatRequest(BaseModel):
+    """Request schema for the browser chat surface."""
+    # Prompt text sent by the UI. This server's /chat handler accepts the body
+    # but does not read it.
+    prompt: str
+    # Generation budget; defaults to 64 when the client omits the field.
+    max_new_tokens: int = 64
 @app.get("/health")
 def health() -> dict[str, object]:
     """Report llama.cpp availability for CPU serving."""
+    # "Available" means shutil.which finds a `llama-server` executable on PATH.
+    return {"status": "ok", "llama_cpp_available": shutil.which("llama-server") is not None, "chat": chat_status()}
+def chat_status() -> dict[str, object]:
+    """Describe why browser chat is disabled on the CPU server."""
+    notice = (
+        "Browser chat is only wired to the PyTorch GPU server in this repo. "
+        "Use serve.server:app for direct interaction."
+    )
+    return {"available": False, "warning": notice}
+@app.get("/chat/status")
+def get_chat_status() -> dict[str, object]:
+    """Serve the CPU server's fixed chat-readiness payload."""
+    status = chat_status()
+    return status
+@app.post("/chat")
+def chat(_: ChatRequest) -> dict[str, object]:
+    """Reject chat requests in-band, reusing the status warning as the detail."""
+    status = chat_status()
+    reply: dict[str, object] = {"success": False, "detail": status["warning"]}
+    reply.update(status)
+    return reply
 def _health_action(_: dict[str, object]) -> dict[str, object]:

serve/static/index.html CHANGED Viewed

@@ -139,6 +139,36 @@
       font-size: 12px;
       color: var(--muted);
     }
     @media (max-width: 980px) {
       .grid { grid-template-columns: 1fr; }
     }
@@ -188,6 +218,21 @@
           <h2>API Result</h2>
           <div id="api-result" class="mono">No API result yet.</div>
         </section>
       </div>
       <div>
@@ -215,6 +260,7 @@
       selectedJobId: null,
       stream: null,
       pollTimer: null,
     };
     const loginPanel = document.getElementById("login-panel");
@@ -229,6 +275,11 @@
     const logsEl = document.getElementById("logs");
     const selectedJobEl = document.getElementById("selected-job");
     const apiResultEl = document.getElementById("api-result");
     async function api(path, options = {}) {
       const response = await fetch(path, {
@@ -265,6 +316,28 @@
       return state.presets.find((item) => item.id === presetSelect.value);
     }
     function renderPresetFields() {
       const preset = currentPreset();
       presetFields.innerHTML = "";
@@ -275,7 +348,7 @@
       for (const field of preset.fields) {
         const wrapper = document.createElement("div");
         const label = document.createElement("label");
-        label.textContent = field.label;
         label.htmlFor = `field-${field.name}`;
         wrapper.appendChild(label);
@@ -308,6 +381,7 @@
         input.id = `field-${field.name}`;
         input.dataset.kind = field.kind;
         input.dataset.name = field.name;
         input.placeholder = field.placeholder || "";
         wrapper.appendChild(input);
         presetFields.appendChild(wrapper);
@@ -319,13 +393,20 @@
       for (const element of presetFields.querySelectorAll("[data-name]")) {
         const name = element.dataset.name;
         const kind = element.dataset.kind;
         if (kind === "boolean") {
           args[name] = element.checked;
         } else if (kind === "number") {
           args[name] = element.value === "" ? "" : Number(element.value);
         } else if (kind === "json") {
           args[name] = element.value.trim() ? JSON.parse(element.value) : null;
         } else {
           args[name] = element.value;
         }
       }
@@ -388,6 +469,15 @@
       renderJobs(payload.jobs || []);
     }
     function appendLog(line) {
       if (!line) {
         return;
@@ -432,6 +522,7 @@
         presetSelect.appendChild(option);
       }
       renderPresetFields();
     }
     async function login() {
@@ -492,6 +583,35 @@
         apiResultEl.textContent = String(error.message || error);
       }
     });
     loadPresets().then(() => {
       showApp();

       font-size: 12px;
       color: var(--muted);
     }
+    /* One-line readiness text shown above the transcript. */
+    .chat-status {
+      margin-bottom: 12px;
+      font-size: 13px;
+      color: var(--muted);
+    }
+    /* Transcript container: height-bounded and scrollable, one grid row per message. */
+    .chat-messages {
+      min-height: 220px;
+      max-height: 360px;
+      overflow: auto;
+      padding: 12px;
+      border-radius: 10px;
+      border: 1px solid var(--panel-border);
+      background: #0b1015;
+      display: grid;
+      gap: 10px;
+    }
+    /* Shared bubble shape; pre-wrap keeps line breaks in prompts and replies. */
+    .message {
+      border-radius: 10px;
+      padding: 10px 12px;
+      white-space: pre-wrap;
+      line-height: 1.5;
+    }
+    /* Bubble colours for the user's prompts. */
+    .message-user {
+      background: #0f2330;
+      border: 1px solid #1e3c50;
+    }
+    /* Bubble colours for model replies and error text. */
+    .message-model {
+      background: #151b22;
+      border: 1px solid var(--panel-border);
+    }
     @media (max-width: 980px) {
       .grid { grid-template-columns: 1fr; }
     }
           <h2>API Result</h2>
           <div id="api-result" class="mono">No API result yet.</div>
         </section>
+        <!-- Model chat panel: status line, transcript, prompt box, token budget and send button. -->
+        <section class="panel">
+          <h2>Model Chat</h2>
+          <div id="chat-status" class="chat-status">Checking chat status...</div>
+          <div id="chat-messages" class="chat-messages">
+            <div class="message message-model">Prompt the local model here once a tokenizer exists. If no checkpoint is loaded yet, outputs will be random.</div>
+          </div>
+          <label for="chat-prompt">Prompt</label>
+          <textarea id="chat-prompt" placeholder="Explain what data files I need before training this repo."></textarea>
+          <label for="chat-max-tokens">Max New Tokens</label>
+          <!-- The UI starts at 96 tokens and bounds the field to 1-512. -->
+          <input id="chat-max-tokens" type="number" value="96" min="1" max="512">
+          <div class="button-row">
+            <button id="chat-send" class="primary">Send Prompt</button>
+          </div>
+        </section>
       </div>
       <div>
       selectedJobId: null,
       stream: null,
       pollTimer: null,
+      chatStatus: null,
     };
     const loginPanel = document.getElementById("login-panel");
     const logsEl = document.getElementById("logs");
     const selectedJobEl = document.getElementById("selected-job");
     const apiResultEl = document.getElementById("api-result");
+    const chatStatusEl = document.getElementById("chat-status");
+    const chatMessagesEl = document.getElementById("chat-messages");
+    const chatPromptEl = document.getElementById("chat-prompt");
+    const chatMaxTokensEl = document.getElementById("chat-max-tokens");
+    const chatSendEl = document.getElementById("chat-send");
     async function api(path, options = {}) {
       const response = await fetch(path, {
       return state.presets.find((item) => item.id === presetSelect.value);
     }
+    function appendMessage(role, text) {
+      // Add one chat bubble and keep the newest message scrolled into view.
+      const bubble = document.createElement("div");
+      const variant = role === "user" ? "message-user" : "message-model";
+      bubble.className = `message ${variant}`;
+      // textContent, not innerHTML, so the text is never parsed as markup.
+      bubble.textContent = text;
+      chatMessagesEl.appendChild(bubble);
+      chatMessagesEl.scrollTop = chatMessagesEl.scrollHeight;
+    }
+    function renderChatStatus(payload) {
+      // Cache the latest status, then mirror it onto the status line and Send button.
+      state.chatStatus = payload;
+      if (!payload) {
+        chatStatusEl.textContent = "Chat status unavailable.";
+        chatSendEl.disabled = true;
+        return;
+      }
+      const headline = payload.available ? "Chat ready." : "Chat unavailable.";
+      const detail = payload.warning ? ` ${payload.warning}` : "";
+      chatStatusEl.textContent = headline + detail;
+      chatSendEl.disabled = !payload.available;
+    }
     function renderPresetFields() {
       const preset = currentPreset();
       presetFields.innerHTML = "";
       for (const field of preset.fields) {
         const wrapper = document.createElement("div");
         const label = document.createElement("label");
+        label.textContent = field.required ? `${field.label} *` : field.label;
         label.htmlFor = `field-${field.name}`;
         wrapper.appendChild(label);
         input.id = `field-${field.name}`;
         input.dataset.kind = field.kind;
         input.dataset.name = field.name;
+        input.dataset.required = field.required ? "true" : "false";
         input.placeholder = field.placeholder || "";
         wrapper.appendChild(input);
         presetFields.appendChild(wrapper);
       for (const element of presetFields.querySelectorAll("[data-name]")) {
         const name = element.dataset.name;
         const kind = element.dataset.kind;
+        const required = element.dataset.required === "true";
         if (kind === "boolean") {
           args[name] = element.checked;
         } else if (kind === "number") {
           args[name] = element.value === "" ? "" : Number(element.value);
         } else if (kind === "json") {
+          if (required && !element.value.trim()) {
+            throw new Error(`Field ${name} is required.`);
+          }
           args[name] = element.value.trim() ? JSON.parse(element.value) : null;
         } else {
+          if (required && !element.value.trim()) {
+            throw new Error(`Field ${name} is required.`);
+          }
           args[name] = element.value;
         }
       }
       renderJobs(payload.jobs || []);
     }
+    async function loadChatStatus() {
+      // Fetch chat readiness; any failure is rendered as "unavailable" with the error text.
+      await api("/chat/status")
+        .then((status) => renderChatStatus(status))
+        .catch((error) => renderChatStatus({ available: false, warning: String(error.message || error) }));
+    }
     function appendLog(line) {
       if (!line) {
         return;
         presetSelect.appendChild(option);
       }
       renderPresetFields();
+      await loadChatStatus();
     }
     async function login() {
         apiResultEl.textContent = String(error.message || error);
       }
     });
+    chatSendEl.addEventListener("click", async () => {
+      // Send the prompt to /chat and append both sides of the exchange to the transcript.
+      const prompt = chatPromptEl.value.trim();
+      if (!prompt) {
+        apiResultEl.textContent = "Prompt cannot be empty.";
+        return;
+      }
+      // A number input still accepts typed fractional or out-of-range values, and
+      // the request schema types max_new_tokens as an int. Send a whole number
+      // inside the field's own 1-512 bounds, falling back to 96 if unparseable.
+      const requestedTokens = Number(chatMaxTokensEl.value || 96);
+      const maxNewTokens = Number.isFinite(requestedTokens)
+        ? Math.min(512, Math.max(1, Math.trunc(requestedTokens)))
+        : 96;
+      appendMessage("user", prompt);
+      chatPromptEl.value = "";
+      // Block double submits while the request is in flight.
+      chatSendEl.disabled = true;
+      try {
+        const payload = await api("/chat", {
+          method: "POST",
+          body: JSON.stringify({
+            prompt,
+            max_new_tokens: maxNewTokens,
+          }),
+        });
+        // The /chat payload carries the status fields too, so refresh the status line from it.
+        renderChatStatus(payload);
+        if (!payload.success) {
+          appendMessage("model", payload.detail || "Chat failed.");
+          return;
+        }
+        appendMessage("model", payload.response || "[empty response]");
+      } catch (error) {
+        appendMessage("model", String(error.message || error));
+      } finally {
+        // Re-enable Send only if the most recent status says chat is available.
+        chatSendEl.disabled = !(state.chatStatus && state.chatStatus.available);
+      }
+    });
     loadPresets().then(() => {
       showApp();

tests/test_control_plane.py CHANGED Viewed

@@ -50,6 +50,8 @@ def test_login_and_html_index(monkeypatch) -> None:
     assert response.status_code == 200
     payload = response.json()
     preset_ids = {item["id"] for item in payload["presets"]}
     assert "serve_cpu" in preset_ids
     assert "git_status" in preset_ids
@@ -120,3 +122,14 @@ def test_health_api_preset(monkeypatch) -> None:
     payload = response.json()
     assert payload["kind"] == "api"
     assert payload["result"]["status"] == "ok"

     assert response.status_code == 200
     payload = response.json()
     preset_ids = {item["id"] for item in payload["presets"]}
+    assert "data_bootstrap" in preset_ids
+    assert "data_pipeline" in preset_ids
     assert "serve_cpu" in preset_ids
     assert "git_status" in preset_ids
     payload = response.json()
     assert payload["kind"] == "api"
     assert payload["result"]["status"] == "ok"
+def test_required_preset_field_validation(monkeypatch) -> None:
+    """A blank required preset argument is rejected with HTTP 400 naming the field label."""
+    monkeypatch.setenv("SAGE_WEB_PASSWORD", "test-password")
+    CONTROL_MANAGER.reset_for_tests()
+    client = TestClient(app)
+    _login(client)
+    body = {"preset_id": "tokenizer_train", "args": {"input_paths": ""}}
+    response = client.post("/api/commands/run", json=body)
+    assert response.status_code == 400
+    detail = response.json()["detail"]
+    assert "Input Paths" in detail

tests/test_data_pipeline.py CHANGED Viewed

@@ -1,6 +1,13 @@
 from data.dataset import pack_sequence
 from data.dedup import deduplicate_records
 from data.filter import filter_record
 def test_filter_record_masks_pii() -> None:
@@ -31,3 +38,48 @@ def test_pack_sequence_shapes() -> None:
     assert packed["input_ids"].tolist() == [1, 2, 3, 4]
     assert packed["labels"].tolist() == [2, 3, 4, 5]
     assert packed["document_boundaries"].tolist() == [0, 0, 1, 0]

+import json
+from pathlib import Path
 from data.dataset import pack_sequence
+from data.bootstrap import bootstrap_raw_corpora
 from data.dedup import deduplicate_records
 from data.filter import filter_record
+from data.ingest import SourceSpec
+import pytest
 def test_filter_record_masks_pii() -> None:
     assert packed["input_ids"].tolist() == [1, 2, 3, 4]
     assert packed["labels"].tolist() == [2, 3, 4, 5]
     assert packed["document_boundaries"].tolist() == [0, 0, 1, 0]
+def test_bootstrap_raw_corpora_writes_jsonl(tmp_path: Path) -> None:
+    """Bootstrapping writes general_web.jsonl whose first record has a text of 240+ characters."""
+    counts = bootstrap_raw_corpora(output_dir=str(tmp_path), overwrite=True)
+    assert counts["general_web"] > 0
+    lines = (tmp_path / "general_web.jsonl").read_text(encoding="utf-8").splitlines()
+    record = json.loads(lines[0])
+    assert "text" in record
+    assert len(record["text"]) >= 240
+def test_pipeline_writes_manifest_from_bootstrap_data(tmp_path: Path, monkeypatch) -> None:
+    """Run bootstrap -> tokenizer training -> pipeline end to end and expect a manifest."""
+    # Skipped when sentencepiece is not installed; the project imports are
+    # deferred until after that check.
+    pytest.importorskip("sentencepiece")
+    from data import pipeline
+    from tokenizer.train_tokenizer import train_sentencepiece, write_training_text
+    raw_dir = tmp_path / "raw"
+    bootstrap_raw_corpora(output_dir=str(raw_dir), overwrite=True)
+    training_text = tmp_path / "training.txt"
+    write_training_text([str(path) for path in raw_dir.glob("*.jsonl")], str(training_text))
+    prefix = tmp_path / "tokenizer"
+    train_sentencepiece(str(training_text), str(prefix), vocab_size=512)
+    # Point the pipeline at the bootstrapped files only, one SourceSpec per JSONL file.
+    registry = tuple(
+        SourceSpec(
+            name=path.stem,
+            domain_tag="general",
+            quality_tier="high",
+            license_category="permissive",
+            estimated_tokens=1_000,
+            path=str(path),
+        )
+        for path in raw_dir.glob("*.jsonl")
+    )
+    monkeypatch.setattr(pipeline, "SOURCE_REGISTRY", registry)
+    output_dir = tmp_path / "processed"
+    summary = pipeline.run_pipeline(
+        tokenizer_model=str(prefix) + ".model",
+        output_dir=str(output_dir),
+        shard_size=4,
+    )
+    manifest_path = output_dir / "manifest.json"
+    assert summary["records"] > 0
+    assert manifest_path.exists()

tests/test_servers.py CHANGED Viewed

@@ -45,3 +45,47 @@ def test_gpu_server_generate(monkeypatch) -> None:
     assert response.status_code == 200
     payload = response.json()
     assert payload["tokens"] == [1, 2, 3, 3, 3]

     assert response.status_code == 200
     payload = response.json()
     assert payload["tokens"] == [1, 2, 3, 3, 3]
+def test_gpu_server_chat(monkeypatch) -> None:
+    """POST /chat returns the decoded continuation when model and tokenizer are faked."""
+    class FakeModel:
+        # Stand-in model whose logits always favour token id 7.
+        def eval(self) -> "FakeModel":
+            return self
+        def to(self, _device) -> "FakeModel":
+            return self
+        def __call__(self, input_ids, past_key_values=None):
+            import torch
+            batch, seq = input_ids.shape
+            logits = torch.zeros((batch, seq, 32), dtype=torch.float32)
+            logits[:, :, 7] = 1.0
+            cache = [] if past_key_values is None else past_key_values
+            return logits, cache
+    class FakeTokenizer:
+        # Encodes any text to [1, 2, 3] and decodes ids to a readable marker string.
+        def encode(self, text, out_type=int):
+            return [1, 2, 3]
+        def decode(self, ids):
+            return "decoded:" + ",".join(str(item) for item in ids)
+    monkeypatch.setattr(gpu_server, "get_model", lambda: FakeModel())
+    monkeypatch.setattr(gpu_server, "get_tokenizer", lambda: FakeTokenizer())
+    monkeypatch.setattr(gpu_server, "chat_status", lambda: {"available": True, "warning": None, "checkpoint_loaded": False})
+    monkeypatch.setattr(gpu_server.torch.cuda, "is_available", lambda: False)
+    client = TestClient(gpu_app)
+    response = client.post("/chat", json={"prompt": "hello", "max_new_tokens": 2})
+    assert response.status_code == 200
+    payload = response.json()
+    assert payload["success"] is True
+    # Two new tokens were requested, so only the two generated 7s are decoded.
+    assert payload["response"] == "decoded:7,7"
+def test_cpu_server_chat_status() -> None:
+    """The CPU server reports browser chat as unavailable."""
+    response = TestClient(cpu_app).get("/chat/status")
+    assert response.status_code == 200
+    assert response.json()["available"] is False

tests/test_tokenizer.py CHANGED Viewed

@@ -1,4 +1,5 @@
 from pathlib import Path
 import pytest
@@ -23,3 +24,22 @@ def test_validation_suite_roundtrip(tmp_path: Path) -> None:
     train_sentencepiece(str(corpus), str(prefix), vocab_size=512)
     results = run_validation_suite(str(prefix) + ".model")
     assert all(result.passed for result in results), results

 from pathlib import Path
+import json
 import pytest
     train_sentencepiece(str(corpus), str(prefix), vocab_size=512)
     results = run_validation_suite(str(prefix) + ".model")
     assert all(result.passed for result in results), results
+def test_write_training_text_reads_jsonl(tmp_path: Path) -> None:
+    """JSONL input is flattened to one text value per output line."""
+    from tokenizer.train_tokenizer import write_training_text
+    samples = ["first training sample", "second training sample"]
+    raw = tmp_path / "raw.jsonl"
+    raw.write_text(
+        "".join(json.dumps({"text": sample}) + "\n" for sample in samples),
+        encoding="utf-8",
+    )
+    output = tmp_path / "combined.txt"
+    write_training_text([str(raw)], str(output))
+    assert output.read_text(encoding="utf-8").splitlines() == samples

tokenizer/train_tokenizer.py CHANGED Viewed

@@ -3,32 +3,50 @@
 from __future__ import annotations
 import argparse
 from pathlib import Path
-from typing import Iterable
-import sentencepiece as spm
-DEFAULT_SPECIAL_TOKENS = ("<bos>", "<eos>", "<pad>", "<unk>", "[INST]", "[/INST]")
-def write_training_text(corpus_paths: Iterable[str], output_path: str) -> str:
     """Concatenate corpus text into a plain-text file for SentencePiece."""
     output = Path(output_path)
     output.parent.mkdir(parents=True, exist_ok=True)
     with output.open("w", encoding="utf-8") as sink:
-        for path in corpus_paths:
-            with Path(path).open("r", encoding="utf-8") as source:
-                for line in source:
-                    line = line.strip()
-                    if line:
-                        sink.write(line)
-                        sink.write("\n")
     return str(output)
 def train_sentencepiece(input_path: str, model_prefix: str, vocab_size: int = 50_000) -> None:
     """Train a byte-fallback SentencePiece BPE model."""
     spm.SentencePieceTrainer.train(
         input=input_path,
         model_prefix=model_prefix,
@@ -52,17 +70,18 @@ def train_sentencepiece(input_path: str, model_prefix: str, vocab_size: int = 50
 def build_argparser() -> argparse.ArgumentParser:
     """Build the CLI parser."""
     parser = argparse.ArgumentParser(description="Train the SAGE SentencePiece tokenizer.")
-    parser.add_argument("--input", nargs="+", required=True, help="Plain-text corpus files.")
     parser.add_argument("--model-prefix", default="tokenizer/tokenizer", help="SentencePiece model prefix.")
     parser.add_argument("--vocab-size", type=int, default=50_000, help="Tokenizer vocabulary size.")
     parser.add_argument("--training-text", default="tokenizer/training_corpus.txt", help="Temporary combined text file.")
     return parser
 def main() -> None:
     """Train the tokenizer from CLI arguments."""
     args = build_argparser().parse_args()
-    training_text = write_training_text(args.input, args.training_text)
     train_sentencepiece(training_text, args.model_prefix, args.vocab_size)

 from __future__ import annotations
 import argparse
+import json
 from pathlib import Path
+from typing import Iterable, Iterator
+DEFAULT_SPECIAL_TOKENS = ("<bos>", "<eos>", "<pad>", "<unk>", "[INST]", "[/INST]")
+def iter_training_text(corpus_paths: Iterable[str], text_key: str = "text") -> Iterator[str]:
+    """Yield training lines from plain-text or JSONL corpus files.
+
+    Files whose suffix is ``.jsonl`` (case-insensitive) are parsed as one JSON
+    document per line and contribute the string stored under ``text_key``.
+    Every other file contributes its non-blank lines. All yielded strings are
+    stripped of surrounding whitespace.
+
+    Args:
+        corpus_paths: Paths of the corpus files to read, in order.
+        text_key: JSONL field holding the training text.
+
+    Yields:
+        One non-empty string per plain-text line or usable JSONL record.
+        JSONL records that are not objects, or whose ``text_key`` value is
+        missing, not a string, or blank, are skipped.
+
+    Raises:
+        json.JSONDecodeError: If a JSONL line is not valid JSON. The message
+            names the file and line number.
+    """
+    for path in corpus_paths:
+        source = Path(path)
+        is_jsonl = source.suffix.lower() == ".jsonl"
+        with source.open("r", encoding="utf-8") as handle:
+            for line_number, raw_line in enumerate(handle, start=1):
+                raw_line = raw_line.strip()
+                if not raw_line:
+                    continue
+                if not is_jsonl:
+                    yield raw_line
+                    continue
+                try:
+                    payload = json.loads(raw_line)
+                except json.JSONDecodeError as err:
+                    # Same exception type as before, but say where the bad record is.
+                    raise json.JSONDecodeError(
+                        f"{source}:{line_number}: {err.msg}", err.doc, err.pos
+                    ) from err
+                # A JSONL line may hold a list, number or string. Those have no
+                # fields, so skip them instead of failing on a missing ``.get``.
+                if not isinstance(payload, dict):
+                    continue
+                text = payload.get(text_key)
+                if isinstance(text, str) and text.strip():
+                    yield text.strip()
+def write_training_text(corpus_paths: Iterable[str], output_path: str, text_key: str = "text") -> str:
     """Concatenate corpus text into a plain-text file for SentencePiece."""
     output = Path(output_path)
+    # Create any missing parent directories first.
     output.parent.mkdir(parents=True, exist_ok=True)
+    # Mode "w" truncates a previous combined file; lines are streamed, not buffered in a list.
     with output.open("w", encoding="utf-8") as sink:
+        for line in iter_training_text(corpus_paths, text_key=text_key):
+            sink.write(line)
+            sink.write("\n")
+    # Return the path as a string, which is what main() hands to train_sentencepiece.
     return str(output)
 def train_sentencepiece(input_path: str, model_prefix: str, vocab_size: int = 50_000) -> None:
     """Train a byte-fallback SentencePiece BPE model."""
+    import sentencepiece as spm
     spm.SentencePieceTrainer.train(
         input=input_path,
         model_prefix=model_prefix,
 def build_argparser() -> argparse.ArgumentParser:
     """Build the CLI parser."""
     parser = argparse.ArgumentParser(description="Train the SAGE SentencePiece tokenizer.")
+    # --input is the only required option and accepts one or more paths.
+    parser.add_argument("--input", nargs="+", required=True, help="Plain-text or JSONL corpus files.")
     parser.add_argument("--model-prefix", default="tokenizer/tokenizer", help="SentencePiece model prefix.")
     parser.add_argument("--vocab-size", type=int, default=50_000, help="Tokenizer vocabulary size.")
     parser.add_argument("--training-text", default="tokenizer/training_corpus.txt", help="Temporary combined text file.")
+    # Exposed on the parsed namespace as args.text_key.
+    parser.add_argument("--text-key", default="text", help="JSONL field to read when --input contains .jsonl files.")
     return parser
 def main() -> None:
     """Train the tokenizer from CLI arguments."""
     args = build_argparser().parse_args()
+    # Step 1: flatten every input file into the single combined text file.
+    training_text = write_training_text(args.input, args.training_text, text_key=args.text_key)
+    # Step 2: train SentencePiece on that combined file.
     train_sentencepiece(training_text, args.model_prefix, args.vocab_size)