#!/usr/bin/env python3 """ QUICK START: Qwen3.5-0.8B → Student (100-150M) For RTX 2050 (4GB VRAM) on Arch Linux """ import subprocess import sys from pathlib import Path import logging import time logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # ============================================================================ # STEP 0: Install Dependencies # ============================================================================ def install_dependencies(): """Install required packages with uv""" logger.info("Installing dependencies with uv...") packages = [ "torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121", "transformers>=4.40.0", "accelerate", "datasets", "bitsandbytes", # For quantization "peft", # For LoRA ] for pkg in packages: logger.info(f"Installing: {pkg}") subprocess.run([sys.executable, "-m", "pip", "install", pkg], check=False) logger.info("✓ Dependencies installed") # ============================================================================ # STEP 1: GGUF to HuggingFace Conversion # ============================================================================ def convert_gguf_to_hf(gguf_path: str, output_dir: str = "models/qwen_teacher"): """ Convert GGUF to HuggingFace format Note: This requires the model architecture config For Qwen3.5-0.8B, we can also just download from HuggingFace instead """ logger.info(f"Converting GGUF: {gguf_path}") # Option 1: Use ollama/llama.cpp to load and export try: from llama_cpp import Llama logger.info("Loading GGUF with llama.cpp...") llm = Llama(model_path=gguf_path, n_gpu_layers=-1) # Note: llama.cpp doesn't easily export to HuggingFace format logger.warning("GGUF loading for inference only. For training, use HuggingFace model instead.") return llm except ImportError: logger.error("llama-cpp-python not installed. Install with: pip install llama-cpp-python") logger.info("Alternative: Download Qwen from HuggingFace") return None # ============================================================================ # STEP 2: Download Teacher Model # ============================================================================ def download_qwen_teacher(output_dir: str = "models/teacher"): """Download Qwen teacher from HuggingFace""" logger.info("Downloading Qwen teacher model...") from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "Qwen/Qwen2.5-0.5B" # Use 0.5B as proxy for 0.8B # Alternative options: # - "Qwen/Qwen1.5-0.5B" # - "Qwen/Qwen2-0.5B" Path(output_dir).mkdir(parents=True, exist_ok=True) logger.info(f"Downloading {model_name}...") model = AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True, device_map="auto", ) model.save_pretrained(output_dir) tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) tokenizer.save_pretrained(output_dir) logger.info(f"✓ Model saved to {output_dir}") return output_dir # ============================================================================ # STEP 3: Prepare Training Data # ============================================================================ def prepare_dataset(dataset_name: str = "wikitext", split: str = "train", output_file: str = "data/train.txt"): """Download and prepare training data""" logger.info(f"Preparing dataset: {dataset_name}") from datasets import DownloadConfig, load_dataset Path(output_file).parent.mkdir(parents=True, exist_ok=True) logger.info(f"Loading {dataset_name}...") if dataset_name == "wikitext": # Prefer canonical repo/config names and retry transient network failures. wikitext_candidates = [ ("Salesforce/wikitext", "wikitext-2-raw-v1"), ("Salesforce/wikitext", "wikitext-2-v1"), ("wikitext", "wikitext-2-raw-v1"), ("wikitext", "wikitext-2"), ] max_attempts = 4 backoff_seconds = 2 download_config = DownloadConfig(max_retries=8) texts = None last_error = None for dataset_id, config_name in wikitext_candidates: for attempt in range(1, max_attempts + 1): try: logger.info( "Loading %s (%s), split=%s [attempt %s/%s]", dataset_id, config_name, split, attempt, max_attempts, ) dataset_split = load_dataset( dataset_id, config_name, split=split, download_config=download_config, ) texts = dataset_split["text"] break except Exception as exc: last_error = exc if attempt < max_attempts: sleep_s = backoff_seconds * attempt logger.warning( "Dataset load failed for %s (%s): %s. Retrying in %ss...", dataset_id, config_name, exc, sleep_s, ) time.sleep(sleep_s) if texts is not None: break if texts is None: raise RuntimeError( "Failed to load WikiText after retries/fallbacks. " "Please check internet connectivity and Hugging Face availability." ) from last_error elif dataset_name == "pile": dataset = load_dataset("the_pile", split=f"{split}[:5000]") # Subset texts = dataset["text"] else: logger.error(f"Unknown dataset: {dataset_name}") return None # Save to text file logger.info(f"Writing to {output_file}...") with open(output_file, 'w') as f: for text in texts: if text.strip(): f.write(text + "\n") logger.info(f"✓ Dataset saved: {output_file}") return output_file # ============================================================================ # STEP 4: Configuration # ============================================================================ def create_config_template(): """Create config.py template""" config_content = ''' # config.py - Training configuration from qwen_distill import QwenDistillationConfig class MyConfig(QwenDistillationConfig): def __init__(self): super().__init__() # Paths self.data_file = "data/train.txt" self.teacher_model_name = "Qwen/Qwen2.5-0.5B" # Student size (adjust based on your needs) # Small: 3 layers, 128 hidden = ~30M params # Medium: 5 layers, 256 hidden = ~100M params # Large: 8 layers, 384 hidden = ~250M params self.student_num_layers = 5 self.student_hidden_dim = 256 self.student_num_heads = 4 # Training self.batch_size = 2 self.gradient_accumulation_steps = 4 self.max_steps = 2000 self.learning_rate = 8e-4 # Distillation self.temperature = 3.0 self.alpha = 0.8 # 80% KD loss self.beta = 0.2 # 20% feature loss # Memory self.use_gradient_checkpointing = True self.mixed_precision = "fp16" ''' with open("config.py", 'w') as f: f.write(config_content) logger.info("✓ Created config.py template") # ============================================================================ # STEP 5: Training Script # ============================================================================ def create_train_script(): """Create training script""" train_script = '''#!/usr/bin/env python3 from qwen_distill import QwenDistillationConfig, QwenDistillationTrainer, TextDataset from torch.utils.data import DataLoader import torch # Load config config = QwenDistillationConfig() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Initialize trainer trainer = QwenDistillationTrainer(config, device) # Load data with open("data/train.txt", "r") as f: texts = [line.strip() for line in f if line.strip()] print(f"Loaded {len(texts)} text samples") # Create dataset & dataloader dataset = TextDataset(texts, trainer.tokenizer, max_length=config.max_seq_length) dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True) # Train trainer.train(dataloader) print("✓ Training complete!") print(f"Student saved to: checkpoints/student_final.pt") ''' with open("train.py", 'w') as f: f.write(train_script) logger.info("✓ Created train.py") # ============================================================================ # USAGE # ============================================================================ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("--setup", action="store_true", help="Setup environment") parser.add_argument("--download", action="store_true", help="Download teacher") parser.add_argument("--data", action="store_true", help="Prepare dataset") parser.add_argument("--config", action="store_true", help="Create config") parser.add_argument("--all", action="store_true", help="Do all steps") args = parser.parse_args() if args.setup or args.all: install_dependencies() if args.download or args.all: download_qwen_teacher() if args.data or args.all: prepare_dataset("wikitext", "train", "data/train.txt") if args.config or args.all: create_config_template() create_train_script() if args.all: logger.info(""" ✓ Setup complete! Next steps: 1. Edit config.py to customize settings 2. Run: python train.py 3. Monitor training in logs/ 4. Evaluate student model (see eval.py) """)