twarner committed
Commit 783cc24 · 1 parent: 956dba9

Update to SD-Gcode end-to-end diffusion model

Files changed (3):
  1. README.md +21 -22
  2. app.py +176 -120
  3. requirements.txt +7 -7
README.md CHANGED
@@ -1,42 +1,41 @@
  ---
  title: dcode
- emoji: ✏️
- colorFrom: gray
- colorTo: green
+ emoji: ✒️
+ colorFrom: green
+ colorTo: blue
  sdk: gradio
- sdk_version: "4.44.0"
+ sdk_version: 5.9.1
  app_file: app.py
  pinned: false
  license: mit
- hardware: t4-small
- short_description: Text to Polargraph Gcode via Latent Diffusion
  ---

  # dcode

- Generate polargraph-compatible gcode from text prompts using latent diffusion.
+ **Text -> Polargraph Gcode via Stable Diffusion**

- ## How it works
+ Single end-to-end diffusion model that converts text prompts directly to polargraph-compatible gcode.

- 1. **Text → Latent**: Stable Diffusion generates a latent representation from your text prompt
- 2. **Latent → Gcode**: Custom transformer decoder converts the latent to gcode commands
- 3. **Validation**: Coordinates are clamped to machine bounds
+ ## Architecture

- ## Usage
+ ```
+ text prompt -> CLIP text encoder -> UNet diffusion -> latent [4,64,64] -> GcodeDecoder -> gcode tokens
+ ```

- 1. Enter a prompt (e.g., "line drawing of a cat")
- 2. Adjust diffusion steps and guidance scale
- 3. Click Generate
- 4. View preview and copy gcode
+ All components post-trained end-to-end on 175,952 image-gcode pairs.

- ## Model
+ ## Machine Specs

- - Base: Stable Diffusion 2.1
- - Decoder: 6-layer transformer trained on 175k image-gcode pairs
- - Final loss: 0.107
+ - Work area: 841mm x 1189mm (A0)
+ - Bounds: X [-420.5, 420.5], Y [-594.5, 594.5]
+ - Pen servo: 40 deg (down), 90 deg (up)

  ## Links

- - [Model](https://huggingface.co/twarner/dcode-latent-gcode)
- - [Dataset](https://huggingface.co/datasets/twarner/dcode-polargraph-gcode)
  - [GitHub](https://github.com/Twarner491/dcode)
+ - [Model](https://huggingface.co/twarner/dcode-sd-gcode)
+ - [Dataset](https://huggingface.co/datasets/twarner/dcode-polargraph-gcode)
+
+ ## License
+
+ MIT
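
Note: the architecture summary above maps onto roughly the following inference sketch (illustrative only, not code from this commit; it assumes a `decoder` already built from the `GcodeDecoder` class added to app.py below and loaded with the published weights):

```python
# Rough sketch of the text -> latent -> gcode path described in the README.
# Assumes `decoder` is a GcodeDecoder (defined in app.py below) with the
# published weights loaded; model IDs are those used by the Space.
import torch
from diffusers import StableDiffusionPipeline
from transformers import AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# 1. Text -> latent [1, 4, 64, 64]: run the full SD denoising loop and stop
#    before VAE decoding by asking the pipeline for latents.
pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=dtype, safety_checker=None
).to(device)
latent = pipe("line drawing of a cat", num_inference_steps=20,
              guidance_scale=7.5, output_type="latent").images

# 2. Latent -> gcode: the decoder autoregressively samples T5-tokenized
#    gcode conditioned on the flattened latent.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
gcode = decoder.generate(latent.to(dtype), tokenizer, max_length=512)
```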
app.py CHANGED
@@ -1,115 +1,188 @@
- """dcode Gradio Space - Text to Gcode via Latent Diffusion."""
+ """dcode Gradio Space - Text to Gcode via SD-Gcode Diffusion."""

  import re
+ import os
+ import json
  import gradio as gr
  import torch
+ import torch.nn as nn
  from pathlib import Path

  # Machine limits
  BOUNDS = {"left": -420.5, "right": 420.5, "top": 594.5, "bottom": -594.5}

- # Model caches
- _generator = None
+ # Model cache
+ _model = None


- def get_generator():
-     """Load and cache the latent-gcode generator."""
-     global _generator
-     if _generator is None:
-         from diffusers import StableDiffusionPipeline, AutoencoderKL
+ class GcodeDecoderConfig:
+     """Configuration for gcode decoder."""
+     def __init__(
+         self,
+         latent_channels: int = 4,
+         latent_size: int = 64,
+         hidden_size: int = 768,
+         num_layers: int = 6,
+         num_heads: int = 12,
+         vocab_size: int = 32128,
+         max_seq_len: int = 1024,
+         dropout: float = 0.1,
+     ):
+         self.latent_channels = latent_channels
+         self.latent_size = latent_size
+         self.latent_dim = latent_channels * latent_size * latent_size
+         self.hidden_size = hidden_size
+         self.num_layers = num_layers
+         self.num_heads = num_heads
+         self.vocab_size = vocab_size
+         self.max_seq_len = max_seq_len
+         self.dropout = dropout
+
+
+ class GcodeDecoder(nn.Module):
+     """Transformer decoder: SD latent -> gcode tokens."""
+
+     def __init__(self, config: GcodeDecoderConfig):
+         super().__init__()
+         self.config = config
+
+         self.latent_proj = nn.Sequential(
+             nn.Linear(config.latent_dim, config.hidden_size * 4),
+             nn.GELU(),
+             nn.Linear(config.hidden_size * 4, config.hidden_size * 16),
+             nn.LayerNorm(config.hidden_size * 16),
+         )
+
+         self.token_embed = nn.Embedding(config.vocab_size, config.hidden_size)
+         self.pos_embed = nn.Embedding(config.max_seq_len, config.hidden_size)
+
+         decoder_layer = nn.TransformerDecoderLayer(
+             d_model=config.hidden_size,
+             nhead=config.num_heads,
+             dim_feedforward=config.hidden_size * 4,
+             dropout=config.dropout,
+             activation='gelu',
+             batch_first=True,
+             norm_first=True,
+         )
+         self.decoder = nn.TransformerDecoder(decoder_layer, config.num_layers)
+
+         self.ln_f = nn.LayerNorm(config.hidden_size)
+         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+         self.lm_head.weight = self.token_embed.weight
+
+     def forward(self, latent: torch.Tensor, input_ids: torch.Tensor) -> torch.Tensor:
+         batch_size, seq_len = input_ids.shape
+         device = input_ids.device
+
+         latent_flat = latent.view(batch_size, -1)
+         memory = self.latent_proj(latent_flat)
+         memory = memory.view(batch_size, 16, self.config.hidden_size)
+
+         positions = torch.arange(seq_len, device=device)
+         x = self.token_embed(input_ids) + self.pos_embed(positions)
+
+         causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=device)
+
+         x = self.decoder(x, memory, tgt_mask=causal_mask)
+         x = self.ln_f(x)
+         return self.lm_head(x)
+
+     @torch.no_grad()
+     def generate(self, latent, tokenizer, max_length=512, temperature=0.8, top_p=0.9):
+         device = latent.device
+         batch_size = latent.shape[0]
+
+         input_ids = torch.full((batch_size, 1), tokenizer.pad_token_id, dtype=torch.long, device=device)
+
+         for _ in range(max_length - 1):
+             logits = self(latent, input_ids)
+             next_logits = logits[:, -1, :] / temperature
+
+             sorted_logits, sorted_indices = torch.sort(next_logits, descending=True)
+             cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+             sorted_indices_to_remove = cumulative_probs > top_p
+             sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
+             sorted_indices_to_remove[:, 0] = False
+
+             for b in range(batch_size):
+                 next_logits[b, sorted_indices[b, sorted_indices_to_remove[b]]] = float('-inf')
+
+             probs = torch.softmax(next_logits, dim=-1)
+             next_token = torch.multinomial(probs, num_samples=1)
+             input_ids = torch.cat([input_ids, next_token], dim=1)
+
+             if next_token.item() == tokenizer.eos_token_id:
+                 break
+
+         return tokenizer.decode(input_ids[0], skip_special_tokens=True)
+
+
+ def get_model():
+     """Load and cache the SD-Gcode model."""
+     global _model
+     if _model is None:
+         from diffusers import StableDiffusionPipeline
          from transformers import AutoTokenizer
-         import torch.nn as nn
+         from huggingface_hub import hf_hub_download

          device = "cuda" if torch.cuda.is_available() else "cpu"
          dtype = torch.float16 if device == "cuda" else torch.float32

-         print("Loading Stable Diffusion pipeline...")
-         # Use SD 1.5 which is more reliably available
-         pipe = StableDiffusionPipeline.from_pretrained(
-             "runwayml/stable-diffusion-v1-5",
-             torch_dtype=dtype,
-             safety_checker=None,
-             use_safetensors=True,
-         ).to(device)
+         print("Loading SD-Gcode model...")

-         print("Loading gcode decoder...")
-         from huggingface_hub import hf_hub_download
-
-         # Download model files
-         model_path = hf_hub_download("twarner/dcode-latent-gcode", "pytorch_model.bin")
-         config_path = hf_hub_download("twarner/dcode-latent-gcode", "config.json")
+         # Download config and weights
+         config_path = hf_hub_download("twarner/dcode-sd-gcode", "config.json")
+         weights_path = hf_hub_download("twarner/dcode-sd-gcode", "pytorch_model.bin")

-         import json
          with open(config_path) as f:
              config = json.load(f)

-         # Load tokenizer
-         tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
-
-         # Build decoder model
-         class LatentProjector(nn.Module):
-             def __init__(self, latent_dim, hidden_size):
-                 super().__init__()
-                 self.proj = nn.Sequential(
-                     nn.Linear(latent_dim, hidden_size * 2),
-                     nn.GELU(),
-                     nn.Linear(hidden_size * 2, hidden_size),
-                     nn.LayerNorm(hidden_size),
-                 )
-             def forward(self, x):
-                 return self.proj(x)
-
-         class GcodeDecoder(nn.Module):
-             def __init__(self, hidden_size, vocab_size, num_layers, num_heads, max_seq_len):
-                 super().__init__()
-                 self.embed = nn.Embedding(vocab_size, hidden_size)
-                 self.pos_embed = nn.Embedding(max_seq_len, hidden_size)
-                 layer = nn.TransformerDecoderLayer(hidden_size, num_heads, hidden_size * 4, batch_first=True)
-                 self.decoder = nn.TransformerDecoder(layer, num_layers)
-                 self.head = nn.Linear(hidden_size, vocab_size)
-                 self.max_seq_len = max_seq_len
-
-             def forward(self, tgt, memory, tgt_mask=None):
-                 pos = torch.arange(tgt.size(1), device=tgt.device)
-                 x = self.embed(tgt) + self.pos_embed(pos)
-                 x = self.decoder(x, memory, tgt_mask=tgt_mask)
-                 return self.head(x)
-
-         # Initialize models
-         latent_dim = 4 * 64 * 64
-         hidden_size = config.get("hidden_size", 512)
-         vocab_size = tokenizer.vocab_size
-         num_layers = config.get("num_layers", 6)
-         num_heads = config.get("num_heads", 8)
-         max_seq_len = config.get("max_seq_len", 1024)
-
-         projector = LatentProjector(latent_dim, hidden_size).to(device, dtype)
-         decoder = GcodeDecoder(hidden_size, vocab_size, num_layers, num_heads, max_seq_len).to(device, dtype)
+         # Load SD pipeline
+         sd_model_id = config.get("sd_model_id", "runwayml/stable-diffusion-v1-5")
+         print(f"Loading SD from {sd_model_id}...")
+         pipe = StableDiffusionPipeline.from_pretrained(
+             sd_model_id,
+             torch_dtype=dtype,
+             safety_checker=None,
+         ).to(device)
+
+         # Build gcode decoder
+         gcode_cfg = config.get("gcode_decoder", {})
+         decoder_config = GcodeDecoderConfig(
+             latent_channels=gcode_cfg.get("latent_channels", 4),
+             latent_size=gcode_cfg.get("latent_size", 64),
+             hidden_size=gcode_cfg.get("hidden_size", 768),
+             num_layers=gcode_cfg.get("num_layers", 6),
+             num_heads=gcode_cfg.get("num_heads", 12),
+             vocab_size=gcode_cfg.get("vocab_size", 32128),
+             max_seq_len=gcode_cfg.get("max_seq_len", 1024),
+         )
+         gcode_decoder = GcodeDecoder(decoder_config).to(device, dtype)

          # Load weights
-         state_dict = torch.load(model_path, map_location=device)
-
-         proj_state = {k.replace("projector.", ""): v for k, v in state_dict.items() if k.startswith("projector.")}
-         dec_state = {k.replace("decoder.", ""): v for k, v in state_dict.items() if k.startswith("decoder.")}
+         state_dict = torch.load(weights_path, map_location=device)

-         projector.load_state_dict(proj_state)
-         decoder.load_state_dict(dec_state)
+         # Extract decoder weights
+         decoder_state = {k.replace("gcode_decoder.", ""): v for k, v in state_dict.items()
+                          if k.startswith("gcode_decoder.")}
+         gcode_decoder.load_state_dict(decoder_state)
+         gcode_decoder.eval()

-         projector.eval()
-         decoder.eval()
+         # Gcode tokenizer
+         gcode_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

-         _generator = {
+         _model = {
              "pipe": pipe,
-             "projector": projector,
-             "decoder": decoder,
-             "tokenizer": tokenizer,
+             "gcode_decoder": gcode_decoder,
+             "gcode_tokenizer": gcode_tokenizer,
              "device": device,
              "dtype": dtype,
-             "max_seq_len": max_seq_len,
+             "num_inference_steps": config.get("num_inference_steps", 20),
          }
-         print("Models loaded!")
+         print("Model loaded!")

-     return _generator
+     return _model


  def validate_gcode(gcode: str) -> str:
@@ -220,21 +293,19 @@ def gcode_to_svg(gcode: str) -> str:


  def generate(prompt: str, temperature: float, max_tokens: int, num_steps: int, guidance: float):
-     """Generate gcode from text prompt via latent diffusion."""
+     """Generate gcode from text prompt via SD-Gcode diffusion."""
      if not prompt or not prompt.strip():
          return "Enter a prompt to generate gcode", gcode_to_svg("")

      try:
-         gen = get_generator()
-         pipe = gen["pipe"]
-         projector = gen["projector"]
-         decoder = gen["decoder"]
-         tokenizer = gen["tokenizer"]
-         device = gen["device"]
-         dtype = gen["dtype"]
-         max_seq_len = gen["max_seq_len"]
+         m = get_model()
+         pipe = m["pipe"]
+         gcode_decoder = m["gcode_decoder"]
+         gcode_tokenizer = m["gcode_tokenizer"]
+         device = m["device"]
+         dtype = m["dtype"]

-         # 1. Text -> Latent via Stable Diffusion
+         # 1. Text -> Latent via full SD diffusion
          with torch.no_grad():
              result = pipe(
                  prompt,
@@ -242,37 +313,22 @@ def generate(prompt: str, temperature: float, max_tokens: int, num_steps: int, g
                  guidance_scale=guidance,
                  output_type="latent",
              )
-         latent = result.images  # [1, 4, 64, 64]
+         latent = result.images.to(dtype)  # [1, 4, 64, 64]

-         # 2. Latent -> Gcode via decoder
+         # 2. Latent -> Gcode via trained decoder
          with torch.no_grad():
-             # Flatten and project latent
-             latent_flat = latent.view(1, -1).to(dtype)  # [1, 4*64*64]
-             memory = projector(latent_flat).unsqueeze(1)  # [1, 1, hidden]
-
-             # Autoregressive decoding
-             bos_id = tokenizer.bos_token_id or tokenizer.pad_token_id
-             eos_id = tokenizer.eos_token_id
-
-             tokens = torch.tensor([[bos_id]], device=device)
-
-             for _ in range(min(max_tokens, max_seq_len - 1)):
-                 logits = decoder(tokens, memory)
-                 next_logits = logits[:, -1, :] / temperature
-                 probs = torch.softmax(next_logits, dim=-1)
-                 next_token = torch.multinomial(probs, 1)
-                 tokens = torch.cat([tokens, next_token], dim=1)
-
-                 if next_token.item() == eos_id:
-                     break
-
-             gcode = tokenizer.decode(tokens[0], skip_special_tokens=True)
+             gcode = gcode_decoder.generate(
+                 latent,
+                 gcode_tokenizer,
+                 max_length=max_tokens,
+                 temperature=temperature,
+             )

          gcode = validate_gcode(gcode)
          line_count = len(gcode.split("\n"))
          svg = gcode_to_svg(gcode)

-         gcode_with_header = f"; dcode output - {line_count} lines\n; Prompt: {prompt}\n; Machine validated\n\n{gcode}"
+         gcode_with_header = f"; dcode SD-Gcode output - {line_count} lines\n; Prompt: {prompt}\n; Machine validated\n\n{gcode}"
          return gcode_with_header, svg

      except Exception as e:
@@ -291,11 +347,11 @@ custom_css = """
  with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="emerald")) as demo:
      gr.Markdown("""
      # dcode
-     **Text Polargraph Gcode via Latent Diffusion**
+     **Text -> Polargraph Gcode via Stable Diffusion**

-     Uses Stable Diffusion to generate latents from text, then decodes to machine gcode.
+     Single end-to-end diffusion model: text -> CLIP -> UNet -> latent -> gcode decoder -> gcode

-     [GitHub](https://github.com/Twarner491/dcode) | [Model](https://huggingface.co/twarner/dcode-latent-gcode) | [Dataset](https://huggingface.co/datasets/twarner/dcode-polargraph-gcode)
+     [GitHub](https://github.com/Twarner491/dcode) | [Model](https://huggingface.co/twarner/dcode-sd-gcode) | [Dataset](https://huggingface.co/datasets/twarner/dcode-polargraph-gcode)
      """)

      with gr.Row():
@@ -307,11 +363,11 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="emerald")) as d
      )

      with gr.Row():
-         temperature = gr.Slider(0.5, 1.5, value=0.9, label="Temperature")
+         temperature = gr.Slider(0.5, 1.5, value=0.8, label="Temperature")
          max_tokens = gr.Slider(256, 1024, value=512, step=128, label="Max Tokens")

      with gr.Row():
-         num_steps = gr.Slider(10, 50, value=25, step=5, label="Diffusion Steps")
+         num_steps = gr.Slider(10, 50, value=20, step=5, label="Diffusion Steps")
          guidance = gr.Slider(1.0, 15.0, value=7.5, step=0.5, label="Guidance Scale")

      generate_btn = gr.Button("Generate", variant="primary", size="lg")
@@ -338,7 +394,7 @@ with gr.Blocks(css=custom_css, theme=gr.themes.Soft(primary_hue="emerald")) as d

      gr.Markdown("""
      ---
-     **Machine Bounds**: X: ±420.5mm, Y: ±594.5mm | Pen servo: 40° (down) / 90° (up) | **License**: MIT
+     **Machine Bounds**: X: +/-420.5mm, Y: +/-594.5mm | Pen servo: 40 deg (down) / 90 deg (up) | **License**: MIT
      """)

      generate_btn.click(
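
Note: `validate_gcode` is unchanged by this commit, so its body does not appear in the diff. A minimal sketch of the bounds clamping it is responsible for, reconstructed from `BOUNDS` and the machine-bounds footer (hypothetical; the Space's actual implementation may differ):

```python
import re

# Machine limits from app.py; clamp_xy below is a hypothetical reconstruction,
# not the Space's actual validate_gcode.
BOUNDS = {"left": -420.5, "right": 420.5, "top": 594.5, "bottom": -594.5}

def clamp_xy(line: str) -> str:
    """Clamp the X/Y words of a gcode move to the machine bounds."""
    def clamp(m: re.Match) -> str:
        axis, value = m.group(1), float(m.group(2))
        lo, hi = (BOUNDS["left"], BOUNDS["right"]) if axis == "X" else (BOUNDS["bottom"], BOUNDS["top"])
        return f"{axis}{min(hi, max(lo, value)):.2f}"
    return re.sub(r"([XY])(-?\d+(?:\.\d+)?)", clamp, line)

print(clamp_xy("G1 X999.0 Y-700.0"))  # G1 X420.50 Y-594.50
```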
requirements.txt CHANGED
@@ -1,7 +1,7 @@
- gradio>=4.44.0
- torch>=2.0
- transformers>=4.36
- diffusers>=0.25
- accelerate>=0.25
- huggingface_hub>=0.20
- safetensors>=0.4
+ gradio>=4.44.1
+ gradio_client==1.3.0
+ torch
+ diffusers
+ transformers
+ accelerate
+ huggingface_hub