"""
CPU-Friendly Training Script for GLM-4.5V CAD Generation
Simplified version for testing and development

The pipeline is: load a small text dataset -> wrap a small causal LM with
LoRA adapters -> fine-tune on CPU -> sample one generation from the result.
"""

import json
import os
from dataclasses import dataclass
from typing import Any, Dict, List

import torch
from datasets import Dataset, load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Simple configuration for CPU testing
CONFIG = {
    "base_model": "microsoft/DialoGPT-small",  # Small model for CPU testing
    "dataset_name": "CADCODER/GenCAD-Code",
    "output_dir": "./test-cad-model",
    "max_samples": 50,  # Very small for CPU
    "batch_size": 1,
    "gradient_accumulation": 4,
    "epochs": 1,
    "learning_rate": 5e-5,
    "max_length": 512,
}

# Prompt that precedes every training example and seeds test generation.
PROMPT_PREFIX = "Generate CADQuery code:"
# Literal end-of-text marker appended to every training example.
EOS_MARKER = "<|endoftext|>"


def _format_example(code: str) -> str:
    """Return one training text: prompt, newline, code, end-of-text marker."""
    return f"{PROMPT_PREFIX}\n{code}{EOS_MARKER}"


@dataclass
class SimpleDataCollator:
    """Simple data collator for text-only training.

    Attributes:
        tokenizer: Callable tokenizer used to encode each batch.
        max_length: Maximum sequence length; longer texts are truncated.
    """

    tokenizer: Any
    max_length: int = 512

    def __call__(self, features: List[Dict]) -> Dict[str, torch.Tensor]:
        """Tokenize the ``"text"`` field of *features* and build LM labels.

        Args:
            features: Dataset rows, each carrying a ``"text"`` string.

        Returns:
            The tokenizer output plus a ``"labels"`` tensor that copies
            ``input_ids`` with padded positions set to -100.
        """
        texts = [feature["text"] for feature in features]

        batch = self.tokenizer(
            texts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_attention_mask=True,
        )

        # Mask padding through the attention mask rather than by comparing
        # against pad_token_id: setup_simple_model() aliases the pad token to
        # the EOS token, so an id comparison would also hide the genuine
        # end-of-text token and the model would never be trained to stop.
        labels = batch["input_ids"].clone()
        labels[batch["attention_mask"] == 0] = -100
        batch["labels"] = labels

        return batch


def _build_dummy_dataset(max_samples: int) -> Dataset:
    """Build an offline stand-in dataset by cycling a few CadQuery snippets."""
    dummy_codes = [
        "import cadquery as cq\nresult = cq.Workplane('XY').box(10, 10, 5)",
        "import cadquery as cq\nresult = cq.Workplane('XY').cylinder(5, 10)",
        "import cadquery as cq\nresult = cq.Workplane('XY').box(20, 15, 8).fillet(2)",
    ]
    texts = [_format_example(code) for code in dummy_codes]
    # Cycle through the snippets until exactly max_samples rows exist.
    cycled = [texts[i % len(texts)] for i in range(max_samples)]
    return Dataset.from_dict({"text": cycled})


def prepare_simple_dataset(dataset_name: str, max_samples: int = 50):
    """Prepare a simplified text-only dataset for CPU training.

    Args:
        dataset_name: Hub identifier passed to ``load_dataset``.
        max_samples: Number of training rows to keep.

    Returns:
        A dataset with a single ``"text"`` column. If loading or processing
        the requested dataset fails for any reason, a small synthetic dataset
        of the same size is returned instead so the pipeline can still run.
    """
    print(f"📊 Loading dataset: {dataset_name}")

    try:
        # Load small subset
        dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")

        def create_text_examples(examples):
            """Convert to text-only format."""
            # NOTE(review): assumes the dataset exposes a "code" column; if it
            # does not, the KeyError is caught below and dummy data is used.
            return {"text": [_format_example(code) for code in examples["code"]]}

        # Process dataset
        dataset = dataset.map(
            create_text_examples,
            batched=True,
            remove_columns=dataset.column_names,
        )

        print(f"✅ Dataset prepared: {len(dataset)} samples")
        return dataset

    except Exception as e:
        # Deliberate best-effort: this is a smoke-test script, so any failure
        # (no network, missing column, ...) falls back to synthetic data.
        print(f"❌ Dataset loading failed: {e}")

        # Create dummy dataset for testing
        print("🔄 Creating dummy dataset for testing...")
        dataset = _build_dummy_dataset(max_samples)

        print(f"✅ Dummy dataset created: {len(dataset)} samples")
        return dataset


def setup_simple_model(model_name: str):
    """Set up a simple model for CPU training.

    Args:
        model_name: Hub identifier or local path of the base causal LM.

    Returns:
        A ``(model, tokenizer)`` tuple where the model is wrapped with LoRA
        adapters and the tokenizer is guaranteed to have a pad token.
    """
    print(f"🔧 Loading model: {model_name}")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Add pad token if missing
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Load model for CPU
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu",
    )

    # Simple LoRA config for CPU
    lora_config = LoraConfig(
        r=8,  # Small rank for CPU
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type=TaskType.CAUSAL_LM,
        target_modules=["c_attn", "c_proj"],  # DialoGPT modules
    )

    # Apply LoRA
    model = get_peft_model(model, lora_config)

    # Print parameters
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"💡 Trainable: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")

    return model, tokenizer


def train_simple_model(model, tokenizer, dataset, config):
    """Train the model with simple settings.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer used by the collator and saved next to the model.
        dataset: Training dataset with a ``"text"`` column.
        config: Mapping with the keys ``output_dir``, ``batch_size``,
            ``gradient_accumulation``, ``epochs``, ``learning_rate`` and
            ``max_length``.

    Returns:
        The ``Trainer`` instance after training and saving.
    """
    print("🏋️ Starting CPU training...")

    # Training arguments for CPU.
    # Evaluation is left at the library default ("no"); the explicit
    # `evaluation_strategy` keyword is deprecated/renamed and fails on newer
    # transformers releases.
    training_args = TrainingArguments(
        output_dir=config["output_dir"],
        per_device_train_batch_size=config["batch_size"],
        gradient_accumulation_steps=config["gradient_accumulation"],
        num_train_epochs=config["epochs"],
        learning_rate=config["learning_rate"],
        warmup_steps=10,
        logging_steps=5,
        save_steps=100,
        save_total_limit=1,
        # The collator needs the raw "text" column, so keep it.
        remove_unused_columns=False,
        report_to="none",
        fp16=False,  # No FP16 on CPU
        dataloader_pin_memory=False,
        use_cpu=True,
    )

    # Data collator
    data_collator = SimpleDataCollator(
        tokenizer=tokenizer,
        max_length=config["max_length"],
    )

    # Trainer. The tokenizer is not handed to Trainer (that keyword is
    # deprecated in favour of `processing_class`); the collator already owns
    # it and it is saved explicitly below.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )

    # Train
    print("⏳ Training will take a few minutes on CPU...")
    trainer.train()

    # Save
    trainer.save_model()
    tokenizer.save_pretrained(config["output_dir"])

    print(f"✅ Training complete! Model saved to {config['output_dir']}")
    return trainer


def test_simple_model(model_path: str):
    """Test the trained model.

    Args:
        model_path: Directory holding the saved model and tokenizer.

    Returns:
        The generated text on success, or the error message as a string if
        loading or generation fails.
    """
    print(f"🧪 Testing model: {model_path}")

    try:
        # Load model
        tokenizer = AutoTokenizer.from_pretrained(model_path)
        model = AutoModelForCausalLM.from_pretrained(model_path)

        # Test generation. Tokenizing via __call__ also yields the attention
        # mask, which generate() needs because pad and EOS share an id here.
        inputs = tokenizer(PROMPT_PREFIX, return_tensors="pt")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
            )

        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print("🎯 Generated:")
        print(generated)

        return generated

    except Exception as e:
        print(f"❌ Testing failed: {e}")
        return str(e)


def main():
    """Main training pipeline for CPU.

    Returns:
        True if every step completed, False if any step raised.
    """
    print("🚀 Starting CPU Training Pipeline")
    print("=" * 50)

    try:
        # 1. Prepare dataset
        print("\n📊 Step 1: Preparing dataset...")
        dataset = prepare_simple_dataset(CONFIG["dataset_name"], CONFIG["max_samples"])

        # 2. Setup model
        print("\n🔧 Step 2: Setting up model...")
        model, tokenizer = setup_simple_model(CONFIG["base_model"])

        # 3. Train
        print("\n🏋️ Step 3: Training...")
        train_simple_model(model, tokenizer, dataset, CONFIG)

        # 4. Test
        print("\n🧪 Step 4: Testing...")
        test_simple_model(CONFIG["output_dir"])

        print("\n🎉 Pipeline complete!")
        print(f"Model saved to: {CONFIG['output_dir']}")

        return True

    except Exception as e:
        # Top-level boundary: report the failure and signal it to the caller.
        print(f"\n❌ Pipeline failed: {e}")
        return False


if __name__ == "__main__":
    success = main()

    if success:
        print("\n📝 Next steps:")
        print("1. Check the generated model in ./test-cad-model/")
        print("2. Run test_simple_model() to generate more examples")
        print("3. Once working, move to GPU version")
    else:
        print("\n🔧 Troubleshooting:")
        print("1. Check internet connection for dataset download")
        print("2. Ensure you have enough disk space")
        print("3. Try reducing max_samples to 10")