Spaces:

Really-Amazing
/

SimpleAI-259M

Sleeping

App Files Files Community

Really-Amazing committed on Mar 14

Commit

5c428ee

verified ·

1 Parent(s): de657c0

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -23

app.py CHANGED Viewed

@@ -2,47 +2,61 @@ import torch
 import gradio as gr
 from nanochat.engine import Engine
 from nanochat.tokenizer import get_tokenizer
 MODEL_PATH = "model_000971.pt"
 print("Waking up the toddler (NanoChat-ClimbMix-D12)...")
 tokenizer = get_tokenizer()
-print("Loading checkpoint directly...")
-checkpoint = torch.load(MODEL_PATH, map_location="cpu", weights_only=False)
-# Your checkpoint is a flat state_dict with 'transformer.' prefix
-# So we need the model class instance first
-# Option 1: If nanochat has a from_checkpoint or load method
-# (most likely in checkpoint_manager or engine)
-try:
-    from nanochat.checkpoint_manager import load_model
-    model, _ = load_model(".", checkpoint_name="model_000971.pt", device="cpu")
-except Exception as e:
-    print(f"checkpoint_manager failed: {e}")
-    # Option 2: Direct load if checkpoint is state_dict
-    state_dict = checkpoint
-    # We need a pre-initialized model to load into
-    # Since we can't build GPT without args, assume Engine can help or fallback
-    # For now, raise to see
-    raise ValueError("Cannot reconstruct model — checkpoint is flat state_dict. Need model skeleton or load method")
 model.to("cpu")
 model.eval()
-print("Model loaded!")
 engine = Engine(model=model, tokenizer=tokenizer)
 def chat_fn(message, history):
     return engine.generate(message, max_tokens=512, temperature=0.85)
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
-    gr.Markdown("# 🧸 NanoChat-ClimbMix-D12")
-    gr.ChatInterface(fn=chat_fn)
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 from nanochat.engine import Engine
 from nanochat.tokenizer import get_tokenizer
+from nanochat.gpt import GPT   # ← correct class
 MODEL_PATH = "model_000971.pt"
 print("Waking up the toddler (NanoChat-ClimbMix-D12)...")
+# Tokenizer (Docker fix already placed tokenizer.pkl correctly)
 tokenizer = get_tokenizer()
+print("Creating GPT model skeleton (D12 fallback)...")
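+# Create blank model — hyperparameters are passed as keyword arguments, so the call does not depend on parameter order
+# Keyword names assumed here: vocab_size, n_layer, n_head, n_embd, block_size, dropout — TODO confirm against this fork's GPT constructor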
+model = GPT(
+    vocab_size=50257,     # GPT-2 base — most common
+    n_layer=12,
+    n_head=12,
+    n_embd=768,
+    block_size=1024,
+    dropout=0.1,
+    # If the constructor raises an error about a missing argument, add bias=True or other defaults here
+)
+print("Loading flat state_dict from checkpoint...")
+state_dict = torch.load(MODEL_PATH, map_location="cpu", weights_only=False)
+# Clean torch.compile prefix if present
+unwanted_prefix = '_orig_mod.'
+for k in list(state_dict.keys()):
+    if k.startswith(unwanted_prefix):
+        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)
+# Load — strict=False tolerates both missing and unexpected keys (value_embeds, lambdas, etc.) instead of raising; both lists are returned
+missing, unexpected = model.load_state_dict(state_dict, strict=False)
+print(f"Load info: {len(missing)} missing keys, {len(unexpected)} unexpected keys")
 model.to("cpu")
 model.eval()
+print("Model ready!")
 engine = Engine(model=model, tokenizer=tokenizer)
 def chat_fn(message, history):
+    # engine.generate is called with max_tokens, the parameter name found by grepping engine.py
     return engine.generate(message, max_tokens=512, temperature=0.85)
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
+    gr.Markdown("# 🧸 NanoChat-ClimbMix-D12 – Toddler Phase")
+    gr.Markdown("Confident, funny, wildly inaccurate. Maturing fast → D14/D16/D18 soon!")
+    gr.ChatInterface(
+        fn=chat_fn,
+        examples=["Why is the sky blue?", "What is UPI?", "Write hello world Python"],
+        title="Chat with the Toddler"
+    )
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=7860)