# Hugging Face Space: ProCreations/Intellite-500M (Running on Zero)
# File size: 14,660 Bytes

"""intellite 500M SFT — RLHF data collector served as a Gradio HuggingFace Space.

Every assistant reply gets 👍 / 👎 buttons. When the user rates a reply,
the (system, prior messages, response, liked) tuple is appended to a
local JSONL file, and a CommitScheduler pushes that folder to a dataset
repo on the Hub every 5 minutes.

Weights are downloaded at startup from the `ProCreations/intellite-500m-sft`
model repo (the Space itself is capped at 1 GB LFS so we can't bundle them).

Environment variables:
    INTELLITE_MODEL_REPO   model repo id (default: ProCreations/intellite-500m-sft)
    HF_TOKEN               HF access token with *write* scope on the dataset
                           repo (set as a Space secret). Required for Hub sync;
                           without it feedback is only written locally.
    FEEDBACK_REPO          dataset repo id (default: ProCreations/Intellite-storage)
"""

import json
import os
import sys
import threading
import time
import traceback
import uuid
from pathlib import Path

import gradio as gr
import spaces        # HF ZeroGPU — provides @spaces.GPU decorator + module-level CUDA emulation
import tiktoken
import torch
from huggingface_hub import CommitScheduler, hf_hub_download
from safetensors.torch import load_file as load_safetensors

# Directory containing this file; also used as the parent of the feedback folder.
SPACE_DIR = Path(__file__).resolve().parent
# Put it first on sys.path so the project-local imports below resolve
# regardless of the current working directory (presumably config.py and
# model.py live next to this file — confirm).
sys.path.insert(0, str(SPACE_DIR))

from config import ModelConfig
from model import IntelliteGPT

# ------------------------------------------------------------------------
# Paths & constants

# Hub repo the weights/config are downloaded from (overridable via env var).
MODEL_REPO = os.environ.get("INTELLITE_MODEL_REPO", "ProCreations/intellite-500m-sft")
# Local folder holding the JSONL feedback files; created at import time if missing.
FEEDBACK_DIR = SPACE_DIR / "user_feedback"
FEEDBACK_DIR.mkdir(exist_ok=True)
# Unique filename per replica/restart so concurrent Spaces don't clobber.
FEEDBACK_FILE = FEEDBACK_DIR / f"data_{uuid.uuid4().hex}.jsonl"

# Dataset repo that receives the feedback folder, and the token used to push.
# HF_TOKEN is None when the variable is unset; Hub sync is then disabled.
FEEDBACK_REPO = os.environ.get("FEEDBACK_REPO", "ProCreations/Intellite-storage")
HF_TOKEN = os.environ.get("HF_TOKEN")

DEFAULT_SYSTEM = ""   # empty by default — adding a system prompt empirically hurts this checkpoint's quality
# Role tags of the chat template; each tag is followed by a newline.
SYSTEM_TAG = "<|system|>\n"
USER_TAG = "<|user|>\n"
ASST_TAG = "<|assistant|>\n"
# If the decoded reply contains one of these, generation is cut off there
# (the model has started writing the next turn itself).
STOP_MARKERS = ("<|user|>", "<|system|>")


# ------------------------------------------------------------------------
# Model load (once, at startup)

# Device selection: use "cuda" when a CUDA device is visible OR when the
# SPACES_ZERO_GPU environment variable is set. Per the original author's note,
# ZeroGPU emulates CUDA at module level and the real H200 slice is only
# allocated inside @spaces.GPU functions. Otherwise (e.g. local dev without a
# GPU) this falls back to CPU.
DEVICE = "cuda" if (torch.cuda.is_available() or os.environ.get("SPACES_ZERO_GPU")) else "cpu"
print(f"[sys] device={DEVICE}  model_repo={MODEL_REPO}  zerogpu={bool(os.environ.get('SPACES_ZERO_GPU'))}")

# Pull architecture + weights from the Hub. First call downloads (~30 s for
# 1 GB on cold start); subsequent calls hit HF's local cache.
print(f"[hub] downloading config.json + model.safetensors from {MODEL_REPO}")
config_path = hf_hub_download(repo_id=MODEL_REPO, filename="config.json")
weights_path = hf_hub_download(repo_id=MODEL_REPO, filename="model.safetensors")
with open(config_path) as f:
    hub_cfg = json.load(f)

# Build our intellite ModelConfig from whatever fields HF's config.json carries
# that match. Keys that are not ModelConfig dataclass fields are dropped;
# anything missing falls back to dataclass defaults.
_fields = ModelConfig.__dataclass_fields__.keys()
MCFG = ModelConfig(**{k: v for k, v in hub_cfg.items() if k in _fields})

MODEL = IntelliteGPT(MCFG).to(DEVICE)

# Load the checkpoint tensors directly onto DEVICE. The dtype of the first
# tensor is taken as representative of the checkpoint; if it is not fp32 the
# whole model is cast to it before loading (the original note says the
# safetensors file is bf16, which halves memory vs fp32).
state = load_safetensors(weights_path, device=str(DEVICE))
_wdtype = next(iter(state.values())).dtype
if _wdtype != torch.float32:
    MODEL = MODEL.to(_wdtype)

# `lm_head.weight` was deduped from safetensors (tied to tok_emb.weight).
# IntelliteGPT.__init__ already ties them to the same tensor, so a single
# load of tok_emb.weight populates both — strict=False allows the missing key.
# Any other missing key, and every unexpected key, is only logged (not fatal).
missing, unexpected = MODEL.load_state_dict(state, strict=False)
if unexpected:
    print(f"[load] unexpected keys (ignored): {unexpected}")
if missing and missing != ["lm_head.weight"]:
    print(f"[load] missing keys: {missing}")
MODEL.eval()

# Placeholders that only feed the startup log line below.
TOKENS_SEEN = 0   # not stored in the safetensors-only repo format
BEST_VAL = float("nan")

# GPT-2 BPE tokenizer; EOT is its end-of-text token id, used to terminate turns.
ENC = tiktoken.get_encoding("gpt2")
EOT = ENC.eot_token
N_PARAMS = MODEL.num_params()
print(f"[model] {N_PARAMS/1e6:.1f}M params  tokens_seen={TOKENS_SEEN:,}  best_val={BEST_VAL:.4f}")


# ------------------------------------------------------------------------
# Hub sync — CommitScheduler pushes FEEDBACK_DIR to the dataset every 5 min.

# Start the background uploader only when a token is available. Without one,
# `scheduler` stays None and feedback is written to FEEDBACK_DIR but never pushed.
scheduler = None
if HF_TOKEN:
    scheduler = CommitScheduler(
        folder_path=FEEDBACK_DIR,
        path_in_repo="data",
        repo_id=FEEDBACK_REPO,
        repo_type="dataset",
        token=HF_TOKEN,
        every=5,  # minutes between pushes
    )
    print(f"[hub] scheduler active → {FEEDBACK_REPO} (every 5 min)")
else:
    print("[hub] HF_TOKEN not set — feedback will stay local only")


# ------------------------------------------------------------------------
# Prompt templating + generation (mirrors chat.py)

def render_prompt_ids(system: str, prior_messages: list[dict], user_msg: str) -> list[int]:
    """Encode a conversation into token ids using the SFT chat template.

    The docstring this replaces states the layout matches what sft_prepare.py
    produced (that script is not visible from this file). The emitted layout is::

        <|system|>\\n{system}\\n                          (only if system is non-blank)
        <|user|>\\n{user}\\n<|assistant|>\\n{reply}<EOT>   (one per completed prior turn)
        <|user|>\\n{user_msg}\\n<|assistant|>\\n           (open turn to be generated)

    Args:
        system: Optional system prompt. ``None``, empty, or whitespace-only
            values emit no system block at all.
        prior_messages: Earlier chat messages as ``{"role", "content"}`` dicts.
            Only user messages that are followed by an assistant message are
            rendered; an unanswered user message is replaced by the next user
            message, and assistant messages with no pending user are skipped.
        user_msg: The new user message that the model should answer.

    Returns:
        Token ids ending right after the assistant tag, ready for generation.
    """
    ids: list[int] = []

    # Strip BEFORE testing truthiness: a whitespace-only prompt would otherwise
    # emit an empty system block, while the feedback log records the stripped
    # (empty) system text — the logged record would not match the real prompt.
    system = (system or "").strip()
    if system:
        ids.extend(ENC.encode_ordinary(SYSTEM_TAG + system + "\n"))

    pending_user = None
    for m in prior_messages:
        role = m.get("role")
        content = (m.get("content") or "").strip()
        if role == "user":
            pending_user = content
        elif role == "assistant" and pending_user is not None:
            # The pieces are encoded separately on purpose: merging the strings
            # could change how the tokenizer splits across the boundaries.
            ids.extend(ENC.encode_ordinary(USER_TAG + pending_user + "\n"))
            ids.extend(ENC.encode_ordinary(ASST_TAG))
            ids.extend(ENC.encode_ordinary(content))
            ids.append(EOT)
            pending_user = None

    ids.extend(ENC.encode_ordinary(USER_TAG + user_msg.strip() + "\n"))
    ids.extend(ENC.encode_ordinary(ASST_TAG))
    return ids


@torch.no_grad()
def stream_reply(prompt_ids, max_new, temperature, top_k, top_p, rep_penalty):
    """Yield the partial assistant reply after each new token.

    Samples up to ``max_new`` tokens one at a time from the module-level MODEL,
    applying (in this order) repetition penalty, temperature, top-k and top-p
    filtering before drawing from the resulting distribution.

    Args:
        prompt_ids: Token ids of the rendered prompt (list of ints).
        max_new: Maximum number of tokens to generate.
        temperature: Softmax temperature; clamped below at 1e-5.
        top_k: Keep only the k highest logits; 0 / falsy disables the filter.
        top_p: Nucleus threshold; only applied when strictly between 0 and 1.
        rep_penalty: Repetition penalty; 1.0 / falsy disables it.

    Yields:
        The decoded, whitespace-stripped reply so far. The final text is
        yielded once more after the loop ends, so the last value is always
        the complete reply (possibly empty).
    """
    # Batch of one: x is (1, prompt_len) and grows by one column per step.
    x = torch.tensor([prompt_ids], dtype=torch.long, device=DEVICE)
    ctx = MCFG.seq_len
    start = len(prompt_ids)  # index in x where the generated tokens begin
    reply = ""

    for _ in range(max_new):
        # Feed at most the last `ctx` tokens to the model.
        xc = x[:, -ctx:]
        if DEVICE == "cuda":
            with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
                logits, _ = MODEL(xc)
        else:
            logits, _ = MODEL(xc)
        # Logits for the last position only, in fp32 for the sampling math.
        logits = logits[0, -1, :].float()

        if rep_penalty and rep_penalty != 1.0:
            # Penalise every token id present anywhere in x (prompt included):
            # positive logits are divided, non-positive ones multiplied, so the
            # token always becomes less likely.
            seen = torch.unique(x[0])
            prev = logits[seen]
            logits[seen] = torch.where(prev > 0, prev / rep_penalty, prev * rep_penalty)

        logits = logits / max(temperature, 1e-5)

        if top_k and top_k > 0:
            k = min(int(top_k), logits.numel())
            v, _ = torch.topk(logits, k)
            # v[-1] is the k-th largest logit; everything below it is masked.
            logits[logits < v[-1]] = -float("inf")

        if top_p and 0.0 < top_p < 1.0:
            sorted_logits, sorted_idx = torch.sort(logits, descending=True)
            cum = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
            mask = cum > top_p
            # Shift the mask right by one so the token that crosses the
            # threshold is kept, and the top token is never removed.
            mask[1:] = mask[:-1].clone()
            mask[0] = False
            logits[sorted_idx[mask]] = -float("inf")

        probs = torch.softmax(logits, dim=-1)
        next_tok = torch.multinomial(probs, num_samples=1)
        tok_id = int(next_tok.item())
        x = torch.cat([x, next_tok.unsqueeze(0)], dim=1)

        # End-of-text: stop without adding it to the reply text.
        if tok_id == EOT:
            break

        # Re-decode everything generated so far.
        reply = ENC.decode(x[0, start:].tolist())

        # Drop trailing U+FFFD replacement characters, which appear when the
        # text ends in the middle of a multi-byte character.
        while reply.endswith("\ufffd"):
            reply = reply[:-1]

        # If the model started writing another turn, cut the reply at the
        # first marker found and stop generating.
        hit_stop = False
        for marker in STOP_MARKERS:
            idx = reply.find(marker)
            if idx != -1:
                reply = reply[:idx]
                hit_stop = True
                break
        if hit_stop:
            break

        yield reply.strip()

    # Final value: emitted on EOT, stop marker, or max_new exhaustion.
    yield reply.strip()


# ------------------------------------------------------------------------
# Feedback store — JSONL, append-only, synced to Hub by CommitScheduler.

# Fallback lock for feedback writes when no CommitScheduler (and thus no
# scheduler lock) exists.
_local_lock = threading.Lock()
# Running tallies of saved ratings: total records and how many were 👍.
_local_count = {"total": 0, "liked": 0}


def _count_jsonl_lines(path: Path) -> tuple[int, int]:
    """Count feedback records in a JSONL file.

    Args:
        path: Location of the JSONL file; it does not have to exist.

    Returns:
        ``(total, liked)`` where ``total`` is the number of non-blank lines
        and ``liked`` is how many of them are JSON objects with a truthy
        ``"liked"`` field. Lines that are not valid JSON, or that are valid
        JSON but not an object, still count toward ``total`` only.
    """
    if not path.exists():
        return 0, 0

    total = liked = 0
    # Records are serialised with ensure_ascii=False, so decode explicitly as
    # UTF-8 instead of relying on the locale default. errors="replace" keeps
    # a stray undecodable byte from aborting the whole count.
    with path.open(encoding="utf-8", errors="replace") as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue
            total += 1
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Valid JSON that is not an object (e.g. a list) has no .get().
            if isinstance(record, dict) and record.get("liked"):
                liked += 1
    return total, liked


# Seed the counters from whatever FEEDBACK_FILE already holds. Its name embeds
# a fresh uuid4 per process start, so this is normally (0, 0).
t, l = _count_jsonl_lines(FEEDBACK_FILE)
_local_count["total"], _local_count["liked"] = t, l


def _stats_str() -> str:
    """Return the Markdown summary of collected ratings shown in the side panel.

    The first line reports the record count with its 👍 / 👎 split; the second
    links the dataset repo and says whether Hub syncing is active.
    """
    n_total = _local_count["total"]
    n_liked = _local_count["liked"]
    n_disliked = n_total - n_liked

    if scheduler:
        sync_note = "synced every 5 min"
    else:
        sync_note = "**HF_TOKEN missing — not syncing**"

    dataset_md = f"[`{FEEDBACK_REPO}`](https://huggingface.co/datasets/{FEEDBACK_REPO})"
    counts_line = f"**{n_total}** records this session · 👍 {n_liked} · 👎 {n_disliked}  \n"
    target_line = f"Pushed to {dataset_md} ({sync_note})"
    return counts_line + target_line


def save_feedback(evt: gr.LikeData, history: list, system: str) -> str:
    """Handle a thumbs-up / thumbs-down click on a chat message.

    Appends one JSON line to FEEDBACK_FILE describing the rated assistant
    reply and updates the in-memory counters.

    Args:
        evt: Gradio like-event; ``evt.index`` locates the message in
            ``history`` and ``evt.liked`` carries the rating (``None`` means
            the rating was cleared).
        history: Current chatbot messages as ``{"role", "content"}`` dicts.
        system: System prompt text currently in the UI.

    Returns:
        A short status string for the feedback label. Nothing is written when
        the rating was cleared, the index is invalid, or the message is not
        an assistant message.
    """
    if evt.liked is None:
        return "rating cleared (nothing saved)"

    history = history or []
    idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    if not isinstance(idx, int) or idx < 0 or idx >= len(history):
        return f"bad index {evt.index!r}"

    msg = history[idx]
    if msg.get("role") != "assistant":
        return "skipped non-assistant message"

    record = {
        "ts": time.strftime("%Y-%m-%dT%H:%M:%S"),
        "system": (system or DEFAULT_SYSTEM).strip(),
        "prompt_messages": history[:idx],
        "response": msg.get("content", ""),
        "liked": bool(evt.liked),
    }
    # Serialise before taking the lock so it is held only for the file append.
    line = json.dumps(record, ensure_ascii=False) + "\n"

    # Write under the scheduler's lock (or our own) so the background push
    # never sees a half-written line.
    lock = scheduler.lock if scheduler else _local_lock
    with lock:
        # ensure_ascii=False emits raw non-ASCII text (emoji, accents), so the
        # file must be written as UTF-8 rather than the locale default.
        with FEEDBACK_FILE.open("a", encoding="utf-8") as f:
            f.write(line)
        _local_count["total"] += 1
        if record["liked"]:
            _local_count["liked"] += 1
        # Snapshot inside the lock so the reported number belongs to this write.
        total = _local_count["total"]

    verdict = "👍 good" if evt.liked else "👎 bad"
    return f"saved {verdict} · {total} this session"


# ------------------------------------------------------------------------
# Chat callback

@spaces.GPU(duration=60)
def chat(user_msg, history, system, max_new, temperature, top_k, top_p, rep_penalty):
    """Generate a streamed assistant reply for the chat UI.

    Each yield is ``(history, "")``: the updated message list for the chatbot
    and an empty string that clears the input box. A blank message yields the
    unchanged history once and stops.

    The @spaces.GPU decorator requests a ZeroGPU allocation for up to 60 s per
    call; the original author's note estimates a max-length 800-token reply
    takes ~10 s on the allocated H200 slice.
    """
    text = (user_msg or "").strip()
    if text == "":
        yield history, ""
        return

    # Work on a copy of the incoming transcript and append the new exchange;
    # `answer` is the dict that gets filled in as tokens arrive.
    earlier_turns = list(history)
    answer = {"role": "assistant", "content": ""}
    history = [*earlier_turns, {"role": "user", "content": text}, answer]

    ids = render_prompt_ids(system or DEFAULT_SYSTEM, earlier_turns, text)

    # Keep only the most recent prompt tokens so prompt + reply fit the context
    # window. No trimming when max_new leaves no room at all.
    budget = int(max_new)
    room = MCFG.seq_len - budget
    if 0 < room < len(ids):
        ids = ids[-room:]

    try:
        sampler = stream_reply(ids, budget, float(temperature),
                               int(top_k), float(top_p), float(rep_penalty))
        for partial in sampler:
            answer["content"] = partial
            yield history, ""
    except Exception:
        # Surface the failure in the chat bubble instead of breaking the UI.
        answer["content"] = f"[error] {traceback.format_exc()}"
        yield history, ""


# ------------------------------------------------------------------------
# UI

# Two-column layout: chat transcript + input on the left, generation settings
# and feedback statistics on the right.
with gr.Blocks(
    title="intellite 500M SFT — RLHF collector",
    theme=gr.themes.Soft(primary_hue="indigo", neutral_hue="slate"),
) as demo:
    # Header: model shape, parameter count, weight source and device.
    # NOTE(review): "bf16" is hard-coded here, while the load code uses
    # whatever dtype the checkpoint carries — confirm they agree.
    gr.Markdown(
        f"# intellite 500M SFT — RLHF data collector\n"
        f"{MCFG.d_model}d × {MCFG.n_layers}L × {MCFG.n_heads}h "
        f"({N_PARAMS/1e6:.1f}M params, bf16) · "
        f"weights from [`{MODEL_REPO}`](https://huggingface.co/{MODEL_REPO}) · "
        f"device `{DEVICE}`  \n"
        f"**Please rate every response with 👍 or 👎.** Ratings auto-sync to "
        f"[`{FEEDBACK_REPO}`](https://huggingface.co/datasets/{FEEDBACK_REPO}) "
        f"every 5 minutes for RLHF training."
    )

    with gr.Row():
        with gr.Column(scale=3):
            # type="messages": history is a list of {"role", "content"} dicts,
            # which is the shape chat() and save_feedback() work with.
            chatbot = gr.Chatbot(
                type="messages",
                height=520,
                show_copy_button=True,
                avatar_images=(None, None),
            )
            msg = gr.Textbox(
                placeholder="Your message — Enter to send",
                lines=2,
                show_label=False,
                autofocus=True,
            )
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear chat")
            # Shows the status string returned by save_feedback.
            feedback_status = gr.Markdown("_rate replies with 👍 / 👎_")

        with gr.Column(scale=1):
            system = gr.Textbox(
                value=DEFAULT_SYSTEM,
                label="System prompt (optional — leave blank for best quality)",
                placeholder="(none — model behaves better without one)",
                lines=3,
            )
            # Sampling controls, passed straight through to chat().
            max_new = gr.Slider(16, 800, value=400, step=16, label="max new tokens")
            temp = gr.Slider(0.1, 1.5, value=0.7, step=0.05, label="temperature")
            top_k = gr.Slider(0, 200, value=40, step=1, label="top-k (0 = off)")
            top_p = gr.Slider(0.1, 1.0, value=0.7, step=0.05, label="top-p")
            rep = gr.Slider(1.0, 1.5, value=1.1, step=0.01, label="repetition penalty")

            gr.Markdown("### RLHF data")
            stats_md = gr.Markdown(_stats_str())

    # The Send button and pressing Enter in the textbox trigger the same
    # callback; chat() streams (history, "") so the input box is cleared.
    send_btn.click(
        chat,
        inputs=[msg, chatbot, system, max_new, temp, top_k, top_p, rep],
        outputs=[chatbot, msg],
    )
    msg.submit(
        chat,
        inputs=[msg, chatbot, system, max_new, temp, top_k, top_p, rep],
        outputs=[chatbot, msg],
    )
    # Reset the transcript to an empty list.
    clear_btn.click(lambda: [], None, chatbot, queue=False)

    # 👍 / 👎 click: persist the rating, then refresh the statistics panel.
    chatbot.like(
        save_feedback,
        inputs=[chatbot, system],
        outputs=[feedback_status],
    ).then(lambda: _stats_str(), None, stats_md, queue=False)


# Enable Gradio's event queue on the app at import time.
demo.queue()

# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()