Spaces:

Johnblick187
/

Generations

Running on Zero

App Files Files Community

Generations / app.py

Johnblick187

Update app.py

ee88bdc verified 14 days ago

Raw

History Blame Contribute Delete

5.49 kB

	"""app.py — Tweaktron: Omni-Mythos generation Space (ZeroGPU).

	Pulls a chosen checkpoint step from the HF model repo and runs generation.
	Requires modeling_mythos.py and gdn2.py to be present alongside this file
	in the Space repo.
	"""

	import os
	import sys
	import subprocess

	import torch
	import gradio as gr
	import spaces
	from huggingface_hub import hf_hub_download, list_repo_files
	from transformers import AutoTokenizer

	# ---------------- mamba_src setup ----------------
	# Mamba3 runs on pure Triton kernels and never touches the compiled
	# selective_scan_cuda extension at runtime -- that extension is only used
	# by the legacy Mamba1/Mamba2 fallback path. `pip install mamba-ssm` tries
	# to build it anyway, which is fragile/slow in a Space build environment.
	# Instead: clone the source directly and stub the dead import.
	_MAMBA_SRC = "/home/user/mamba_src"
	if not os.path.exists(_MAMBA_SRC):
	    # Shallow clone: only the current tree is needed, not the history.
	    subprocess.run(
	        ["git", "clone", "--depth", "1",
	         "https://github.com/state-spaces/mamba", _MAMBA_SRC],
	        check=True,
	    )

	# Create the empty stub module whenever it is missing, not only right after
	# a fresh clone: a checkout left behind without the stub (e.g. a start-up
	# interrupted between the clone and this step) would otherwise skip the
	# clone branch on every later start and never get the stub.
	_SCAN_STUB = os.path.join(_MAMBA_SRC, "selective_scan_cuda.py")
	if not os.path.exists(_SCAN_STUB):
	    with open(_SCAN_STUB, "w"):
	        pass

	# Put the checkout first on sys.path so the mamba_ssm import below (and the
	# stub module next to it) resolve from the cloned source.
	sys.path.insert(0, _MAMBA_SRC)
	# verify it actually imports before proceeding
	from mamba_ssm.modules.mamba3 import Mamba3  # noqa: F401
	# ---------------------------------------------------

	from modeling_mythos import OmniMythosDense, MythosConfig

	# ---------------- knobs ----------------
	# Hub model repo whose files are listed and downloaded as checkpoints.
	REPO = "Johnblick187/TweaktronOmniMythosPrototype"
	# Hub repo the tokenizer is loaded from (separate from the checkpoint repo).
	TOKENIZER = "Johnblick187/Tweaktron-Omni-Mythos-Mini"
	# Passed to the model as `n_loops` on every forward call in generate().
	LOOPS = 2
	# Initial value of the "Max new tokens" slider.
	DEFAULT_MAX_TOKENS = 200
	# ----------------------------------------

	# Built once at import time; load_model() and generate() reuse these.
	_tok = AutoTokenizer.from_pretrained(TOKENIZER)
	_cfg = MythosConfig()
	# The model's vocabulary size is taken from the tokenizer rather than from
	# MythosConfig's default.
	# NOTE(review): `vocab_size` on a transformers tokenizer does not count
	# added tokens -- confirm it matches the checkpoint's embedding size.
	_cfg.vocab_size = _tok.vocab_size

	_model_cache = {} # step_label -> loaded model (kept in CPU RAM, moved to GPU per-call); load_model() clears it before inserting, so it holds at most one entry


	def list_available_steps():
	    """Discover which stepN.safetensors files actually exist in the repo.

	    Returns:
	        A ``(labels, error)`` tuple. ``labels`` is the list of checkpoint
	        labels (file names without the ``.safetensors`` suffix) in ascending
	        step order, always followed by ``"latest"``. ``error`` is ``None`` on
	        success; if the repo listing fails, ``labels`` is ``["latest"]`` and
	        ``error`` is a human-readable message for the UI.
	    """
	    try:
	        files = list_repo_files(REPO)
	    except Exception as e:
	        # Best effort: the UI should still come up when the Hub is unreachable.
	        return ["latest"], f"Could not list repo files: {e}"

	    prefix, suffix = "step", ".safetensors"

	    # Strip the suffix by position (not str.replace) so every label maps back
	    # to its exact file name when load_model() appends ".safetensors" again.
	    labels = {
	        name[:-len(suffix)]
	        for name in files
	        if name.startswith(prefix) and name.endswith(suffix)
	    }

	    def step_num(label):
	        # Labels whose tail is not an integer sort ahead of all numbered steps.
	        try:
	            return int(label[len(prefix):])
	        except ValueError:
	            return -1

	    # The label itself breaks ties so the order does not depend on set iteration.
	    steps = sorted(labels, key=lambda label: (step_num(label), label))
	    steps.append("latest")
	    return steps, None


	def load_model(step_label):
	    """Return the model for ``step_label``, loading it on a cache miss.

	    On a miss, ``{step_label}.safetensors`` is downloaded from ``REPO``, a
	    fresh ``OmniMythosDense`` is built in bfloat16, the weights are loaded
	    non-strictly, and the model is put in eval mode and cached.

	    Args:
	        step_label: Checkpoint label, i.e. the file name without the
	            ``.safetensors`` suffix.

	    Returns:
	        The loaded model. It is not moved to any device here.

	    Raises:
	        Whatever ``hf_hub_download`` or ``load_file`` raise when the
	        checkpoint cannot be fetched or read.
	    """
	    cached = _model_cache.get(step_label)
	    if cached is not None:
	        return cached

	    import warnings
	    from safetensors.torch import load_file

	    ckpt_path = hf_hub_download(repo_id=REPO, filename=f"{step_label}.safetensors")

	    # Only one checkpoint is kept at a time. Evict the previous one once the
	    # download has succeeded (so a bad label does not throw away a working
	    # model) but before allocating the new one, so two full models are never
	    # referenced by the cache at the same moment.
	    _model_cache.clear()

	    state_dict = load_file(ckpt_path)

	    model = OmniMythosDense(_cfg).to(torch.bfloat16)
	    # strict=False tolerates key differences, which also means a mismatched
	    # checkpoint would otherwise load "successfully" with untouched random
	    # weights -- surface that in the logs instead of hiding it.
	    missing, unexpected = model.load_state_dict(state_dict, strict=False)
	    if missing or unexpected:
	        warnings.warn(
	            f"Checkpoint {step_label!r} does not fully match the model: "
	            f"{len(missing)} missing key(s) {list(missing)[:5]}, "
	            f"{len(unexpected)} unexpected key(s) {list(unexpected)[:5]}",
	            RuntimeWarning,
	            stacklevel=2,
	        )
	    model.eval()

	    _model_cache[step_label] = model
	    return model


	@spaces.GPU
	def generate(prompt, step_label, max_new_tokens, temperature, top_k):
	    """Sample a continuation of ``prompt`` from the chosen checkpoint.

	    Args:
	        prompt: Text to continue. Empty or whitespace-only input is rejected.
	        step_label: Checkpoint label passed to ``load_model``.
	        max_new_tokens: Upper bound on sampled tokens (converted with ``int``).
	        temperature: Logit divisor; floored at ``1e-5`` to avoid dividing by 0.
	        top_k: If > 0, sample only among the ``top_k`` highest logits (capped
	            at the vocabulary size); 0 or less samples from the full vocabulary.

	    Returns:
	        The decoded prompt plus continuation with special tokens removed, or a
	        short message when there is nothing to generate from.
	    """
	    if not prompt or not prompt.strip():
	        return "Enter a prompt first."

	    ids = _tok.encode(prompt, add_special_tokens=False)
	    if not ids:
	        # Without any input token there is no last position to read logits from.
	        return "Prompt produced no tokens; try different text."

	    model = load_model(step_label)
	    temperature = max(float(temperature), 1e-5)
	    top_k = int(top_k)
	    eos_id = _tok.eos_token_id

	    generated = list(ids)
	    try:
	        model.to("cuda")
	        x = torch.tensor([ids], dtype=torch.long, device="cuda")

	        with torch.no_grad():
	            for _step in range(int(max_new_tokens)):
	                # The whole sequence is re-run each step; only the first
	                # output (the logits) is used.
	                logits, *_ = model(x, n_loops=LOOPS)
	                # Sample in float32: the model runs in bfloat16, which is
	                # coarse for temperature scaling and softmax.
	                next_logits = logits[0, -1, :].float() / temperature

	                if top_k > 0:
	                    # torch.topk raises if k exceeds the dimension size.
	                    k = min(top_k, next_logits.size(-1))
	                    topk_vals, topk_idx = torch.topk(next_logits, k)
	                    probs = torch.softmax(topk_vals, dim=-1)
	                    next_id = topk_idx[torch.multinomial(probs, 1)].item()
	                else:
	                    probs = torch.softmax(next_logits, dim=-1)
	                    next_id = torch.multinomial(probs, 1).item()

	                generated.append(next_id)
	                if next_id == eos_id:
	                    break

	                # Extend the input in place of rebuilding it from the Python list.
	                x = torch.cat([x, x.new_tensor([[next_id]])], dim=1)
	    finally:
	        # Always hand the cached model back to CPU, even if generation failed,
	        # so the next call starts from the same state.
	        model.to("cpu")
	        torch.cuda.empty_cache()

	    return _tok.decode(generated, skip_special_tokens=True)


	# Query the Hub once at start-up; the dropdown is static afterwards.
	_step_choices, _step_error = list_available_steps()

	with gr.Blocks(title="Tweaktron: Omni-Mythos") as demo:
	    gr.Markdown("# Tweaktron: Omni-Mythos\nGenerate text from a chosen training checkpoint.")
	    # Show the listing failure, if any, at the top of the page.
	    if _step_error:
	        gr.Markdown(f"⚠️ {_step_error}")

	    with gr.Row():
	        # Left column: inputs and the run button.
	        with gr.Column():
	            prompt = gr.Textbox(
	                label="Prompt",
	                lines=4,
	                placeholder="Once upon a time...",
	            )
	            step = gr.Dropdown(
	                choices=_step_choices,
	                value=_step_choices[-1] if _step_choices else "latest",
	                label="Checkpoint step",
	            )
	            max_tokens = gr.Slider(
	                minimum=10,
	                maximum=500,
	                value=DEFAULT_MAX_TOKENS,
	                step=10,
	                label="Max new tokens",
	            )
	            temperature = gr.Slider(
	                minimum=0.1,
	                maximum=2.0,
	                value=0.8,
	                step=0.05,
	                label="Temperature",
	            )
	            top_k = gr.Slider(
	                minimum=0,
	                maximum=100,
	                value=40,
	                step=1,
	                label="Top-k (0 = disabled)",
	            )
	            run_btn = gr.Button("Generate", variant="primary")
	        # Right column: generated text.
	        with gr.Column():
	            output = gr.Textbox(label="Output", lines=16)

	    # Input order must match generate()'s positional parameters.
	    run_btn.click(
	        fn=generate,
	        inputs=[prompt, step, max_tokens, temperature, top_k],
	        outputs=output,
	    )

	if __name__ == "__main__":
	    demo.launch()