suraj-self committed on
Commit
f62dc29
·
1 Parent(s): e9e19db
Files changed (3) hide show
  1. Dockerfile +19 -12
  2. app.py +82 -85
  3. requirements.txt +3 -3
Dockerfile CHANGED
@@ -2,25 +2,32 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # system deps
6
  RUN apt-get update && apt-get install -y --no-install-recommends \
7
- git \
8
- libstdc++6 \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- # install torch cpu first
12
- RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
13
-
14
- # install python deps
15
  COPY requirements.txt .
16
  RUN pip install --no-cache-dir -r requirements.txt
17
 
18
- # install rustbpe directly
19
- RUN pip install rustbpe
20
-
21
- # copy repo
22
  COPY . .
23
 
24
- ENV GRADIO_SERVER_NAME=0.0.0.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  CMD ["python", "app.py"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install build tools for Rust-based components
6
  RUN apt-get update && apt-get install -y --no-install-recommends \
7
+ build-essential curl git rustc cargo \
 
8
  && rm -rf /var/lib/apt/lists/*
9
 
10
+ # Install python dependencies
 
 
 
11
  COPY requirements.txt .
12
  RUN pip install --no-cache-dir -r requirements.txt
13
 
14
+ # Copy all files from your repo root to /app
 
 
 
15
  COPY . .
16
 
17
+ # --- THE CRITICAL FIX ---
18
+ # Nanochat looks for these in a specific hidden path.
19
+ # We create that path and copy your uploaded files there.
20
+ RUN mkdir -p /root/.cache/nanochat/tokenizer/ && \
21
+ cp tokenizer.pkl /root/.cache/nanochat/tokenizer/tokenizer.pkl && \
22
+ cp token_bytes.pt /root/.cache/nanochat/tokenizer/token_bytes.pt
23
+
24
+ # Ensure the Hugging Face 'user' (UID 1000) can also see them
25
+ RUN mkdir -p /.cache/nanochat/tokenizer/ && \
26
+ cp tokenizer.pkl /.cache/nanochat/tokenizer/tokenizer.pkl && \
27
+ cp token_bytes.pt /.cache/nanochat/tokenizer/token_bytes.pt && \
28
+ chmod -R 777 /.cache
29
+
30
+ EXPOSE 7860
31
+ ENV GRADIO_SERVER_NAME="0.0.0.0"
32
 
33
  CMD ["python", "app.py"]
app.py CHANGED
@@ -1,93 +1,90 @@
1
- import gradio as gr
2
  import torch
 
3
  from nanochat.gpt import GPT, GPTConfig
4
- from nanochat.tokenizer import get_tokenizer
5
- import json
6
- import os
7
-
8
- # --------------------------
9
- # 1) Load tokenizer
10
- # --------------------------
11
- tokenizer = get_tokenizer()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # --------------------------
14
- # 2) Load model config & weights
15
- # --------------------------
16
- meta_path = "meta_000971.json"
17
- model_path = "model_000971.pt"
18
-
19
- with open(meta_path, "r") as f:
20
- meta = json.load(f)
21
-
22
- config = GPTConfig(**meta["model_config"])
23
  model = GPT(config)
24
- checkpoint = torch.load(model_path, map_location="cpu")
25
- model.load_state_dict(checkpoint)
26
- model.eval()
27
-
28
- # Optional: Torch compile for CPU optimization
29
- try:
30
- model = torch.compile(model)
31
- except Exception as e:
32
- print(f"Torch compile skipped: {e}")
33
-
34
- # --------------------------
35
- # 3) Helper functions
36
- # --------------------------
37
- def chat_with_model(conversation_history, user_input, max_tokens=128, temperature=0.8, top_k=40):
38
- """
39
- conversation_history: list of {"role": "user"/"assistant", "content": str}
40
- user_input: str
41
- Returns updated conversation and assistant's response
42
- """
43
- # Append user's message
44
- conversation_history.append({"role": "user", "content": user_input})
45
 
46
- # Render tokens for completion
47
- conv_for_gen = {"messages": conversation_history + [{"role": "assistant", "content": ""}]}
48
- input_ids = tokenizer.render_for_completion(conv_for_gen)
49
-
50
- # Generate tokens
51
- output_ids = []
52
- for token_id in model.generate(input_ids, max_tokens=max_tokens, temperature=temperature, top_k=top_k):
53
- output_ids.append(token_id)
54
-
55
- # Decode assistant's response
56
- assistant_response = tokenizer.decode(output_ids)
57
-
58
- # Append assistant's message
59
- conversation_history.append({"role": "assistant", "content": assistant_response})
60
-
61
- return conversation_history, assistant_response
62
-
63
- # --------------------------
64
- # 4) Gradio UI
65
- # --------------------------
66
- with gr.Blocks() as demo:
67
- gr.Markdown("## NanoChat ClimbMix D12 🐍\nCPU-friendly GPT chat")
68
-
69
- chatbot = gr.Chatbot()
70
- msg = gr.Textbox(label="Your message")
71
- clear = gr.Button("Clear")
72
-
73
- def user_send(message, history):
74
- history = history or []
75
- conversation_history = [{"role": "user" if i % 2 == 0 else "assistant", "content": m} for i, m in enumerate(sum(history, ()))]
76
-
77
- history, assistant_response = chat_with_model(conversation_history, message)
78
- # Convert to Gradio-friendly format: list of (user, assistant) tuples
79
- gr_history = []
80
- for i in range(0, len(history), 2):
81
- user_msg = history[i]["content"]
82
- assistant_msg = history[i + 1]["content"] if i + 1 < len(history) else ""
83
- gr_history.append((user_msg, assistant_msg))
84
- return gr_history, ""
85
-
86
- msg.submit(user_send, [msg, chatbot], [chatbot, msg])
87
- clear.click(lambda: None, None, chatbot, queue=False)
 
 
 
 
 
 
 
 
 
88
 
89
- # --------------------------
90
- # 5) Launch
91
- # --------------------------
92
  if __name__ == "__main__":
93
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
1
+ import os
2
  import torch
3
+ import gradio as gr
4
  from nanochat.gpt import GPT, GPTConfig
5
+ from nanochat.tokenizer import RustBPETokenizer
6
+
7
+ # Logic to find the tokenizer files
8
+ # Prefer the current directory when it contains token_bytes.pt; otherwise fall back to ~/.cache/nanochat/tokenizer/
9
+ local_path = "."
10
+ cache_path = os.path.expanduser("~/.cache/nanochat/tokenizer/")
11
+ TOKENIZER_DIR = local_path if os.path.exists(os.path.join(local_path, "token_bytes.pt")) else cache_path
12
+
13
+ print(f"--- System Initialization ---")
14
+ print(f"Loading tokenizer from: {os.path.abspath(TOKENIZER_DIR)}")
15
+
16
+ # Load Tokenizer
17
+ tokenizer = RustBPETokenizer.from_directory(TOKENIZER_DIR)
18
+
19
+ # Look up the special-token IDs (these strings must match the special tokens the tokenizer was trained with)
20
+ tokenizer.bos_token_id = tokenizer.enc.encode_single_token("<|bos|>")
21
+ tokenizer.user_start_id = tokenizer.enc.encode_single_token("<|user_start|>")
22
+ tokenizer.user_end_id = tokenizer.enc.encode_single_token("<|user_end|>")
23
+ tokenizer.assistant_start_id = tokenizer.enc.encode_single_token("<|assistant_start|>")
24
+ tokenizer.assistant_end_id = tokenizer.enc.encode_single_token("<|assistant_end|>")
25
+
26
+ # Model Architecture (D12 ClimbMix)
27
+ config = GPTConfig(
28
+ vocab_size=32768,
29
+ n_layer=12,
30
+ n_head=6,
31
+ n_embd=768,
32
+ sequence_len=2048
33
+ )
34
 
 
 
 
 
 
 
 
 
 
 
35
  model = GPT(config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ print("Loading model weights...")
38
+ state_dict = torch.load("model_000971.pt", map_location="cpu")
39
+ # Strip the '_orig_mod.' prefix that torch.compile adds to state-dict keys when the model was compiled during training
40
+ state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
41
+ model.load_state_dict(state_dict, strict=False)
42
+ model.eval()
43
+ print("Toddler is online!")
44
+
45
def _message_text(content):
    """Flatten a Gradio message ``content`` value to plain text.

    Plain strings are returned unchanged.  Lists/tuples are flattened
    recursively, and dict parts contribute their ``"text"`` entry when present.
    NOTE(review): the list-of-parts / dict layout is assumed from newer Gradio
    "messages" payloads -- confirm against the installed Gradio version.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        return str(content.get("text", ""))
    if isinstance(content, (list, tuple)):
        return "".join(_message_text(part) for part in content)
    return str(content)


def _iter_history_turns(history):
    """Yield ``(role, text)`` pairs from either Gradio history layout.

    Supports the legacy layout (a list of ``[user, assistant]`` pairs) and the
    "messages" layout (a list of ``{"role": ..., "content": ...}`` dicts).
    Unpacking a dict as a pair would silently produce its *keys*, so the two
    layouts must be told apart explicitly.
    """
    for entry in history or []:
        if isinstance(entry, dict):
            yield entry.get("role"), _message_text(entry.get("content"))
        else:
            human, assistant = entry
            yield "user", _message_text(human)
            if assistant:
                yield "assistant", _message_text(assistant)


def predict(message, history):
    """Generate the assistant reply for ``message`` given the chat ``history``.

    This is a generator: it yields the reply text produced so far, so a
    streaming UI can update as tokens arrive.

    Args:
        message: The new user message (passed straight to ``tokenizer.encode``).
        history: Previous turns, either as ``[user, assistant]`` pairs or as
            ``{"role", "content"}`` dicts.

    Yields:
        The whitespace-stripped reply text generated so far.
    """
    # Build the prompt: <bos>, each past turn wrapped in its start/end special
    # tokens, the new user turn, then an open assistant turn to complete.
    tokens = [tokenizer.bos_token_id]
    for role, text in _iter_history_turns(history):
        if role == "user":
            tokens.extend(
                [tokenizer.user_start_id] + tokenizer.encode(text) + [tokenizer.user_end_id]
            )
        elif role == "assistant" and text:
            tokens.extend(
                [tokenizer.assistant_start_id]
                + tokenizer.encode(text)
                + [tokenizer.assistant_end_id]
            )
        # Any other role (e.g. a system entry) has no matching special tokens
        # here and is skipped.

    tokens.extend(
        [tokenizer.user_start_id] + tokenizer.encode(message) + [tokenizer.user_end_id]
    )
    tokens.append(tokenizer.assistant_start_id)

    input_ids = torch.tensor([tokens], dtype=torch.long)
    prompt_len = input_ids.shape[1]

    with torch.no_grad():
        # NOTE(review): model.generate is defined outside this file; both a
        # tensor return and a token-yielding generator are handled below.
        output = model.generate(input_ids, max_tokens=512, temperature=0.8)

        if isinstance(output, torch.Tensor):
            # Static result: drop the prompt tokens and decode the remainder.
            new_tokens = output[0][prompt_len:]
            response = tokenizer.decode(new_tokens.tolist())
            # Cut the reply at the first trailing special tag, if any.
            for tag in ["<|assistant_end|>", "<|end|>", "<|user_start|>"]:
                response = response.split(tag)[0]
            yield response.strip()
            return

        # Streaming result: accumulate token ids and re-decode the whole list
        # on every step.  Decoding one token at a time can split a multi-byte
        # UTF-8 character across tokens and emit garbage.
        generated_ids = []
        for token in output:
            token_id = token if isinstance(token, int) else int(token.item())
            # Compare ids rather than searching decoded text for the tag.
            if token_id == tokenizer.assistant_end_id:
                break
            generated_ids.append(token_id)
            yield tokenizer.decode(generated_ids).strip()
81
+
82
+ # Build the chat UI with settings intended to be compatible with Gradio 6.0 (only fn, title and description are passed)
83
+ demo = gr.ChatInterface(
84
+ fn=predict,
85
+ title="🧸 NanoChat-D12",
86
+ description="Optimized for CPU inference."
87
+ )
88
 
 
 
 
89
  if __name__ == "__main__":
90
  demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
- gradio>=4.0
2
- tokenizers
3
- tiktoken
4
  numpy
 
5
  fsspec
6
  rustbpe
 
1
+ torch --index-url https://download.pytorch.org/whl/cpu
2
+ gradio
 
3
  numpy
4
+ tiktoken
5
  fsspec
6
  rustbpe