| |
| """ |
| QUICK START: Qwen3.5-0.8B → Student (100-150M) |
| For RTX 2050 (4GB VRAM) on Arch Linux |
| """ |
|
|
| import subprocess |
| import sys |
| from pathlib import Path |
| import logging |
| import time |
|
|
# Configure the root logger once so INFO-level progress messages from the
# setup steps below are actually emitted.
logging.basicConfig(level=logging.INFO)

# Module-level logger shared by every function in this script.
logger = logging.getLogger(__name__)
|
|
| |
| |
| |
|
|
def install_dependencies():
    """Install the packages the distillation workflow needs.

    ``uv pip install`` is used when a ``uv`` executable is on PATH (targeting
    the interpreter running this script); otherwise the function falls back
    to ``python -m pip install``.

    Each entry of the package list is a complete argument string. It is split
    shell-style before being handed to the installer so that options such as
    ``--index-url`` arrive as separate arguments instead of being glued to the
    package names as one invalid requirement.

    Installation is best-effort: a failing entry is logged and the remaining
    entries are still attempted. Nothing is raised for a failed install and
    nothing is returned.
    """
    import shlex
    import shutil

    uv_path = shutil.which("uv")
    if uv_path:
        # --python pins uv to the interpreter that is running this script.
        base_cmd = [uv_path, "pip", "install", "--python", sys.executable]
        logger.info("Installing dependencies with uv...")
    else:
        base_cmd = [sys.executable, "-m", "pip", "install"]
        logger.info("uv not found on PATH; installing dependencies with pip...")

    package_specs = [
        "torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121",
        "transformers>=4.40.0",
        "accelerate",
        "datasets",
        "bitsandbytes",
        "peft",
    ]

    failed_specs = []
    for spec in package_specs:
        logger.info("Installing: %s", spec)
        # shlex.split turns "a b --opt value" into separate argv entries and
        # leaves single-token specs such as "transformers>=4.40.0" intact.
        result = subprocess.run(base_cmd + shlex.split(spec), check=False)
        if result.returncode != 0:
            logger.error(
                "Install failed for %r (exit code %s)", spec, result.returncode
            )
            failed_specs.append(spec)

    # Only claim success when every installer invocation exited cleanly.
    if failed_specs:
        logger.error("✗ Failed to install: %s", "; ".join(failed_specs))
    else:
        logger.info("✓ Dependencies installed")
|
|
|
|
| |
| |
| |
|
|
def convert_gguf_to_hf(gguf_path: str, output_dir: str = "models/qwen_teacher"):
    """
    Load a GGUF checkpoint through llama-cpp-python.

    Despite the name, nothing is converted or written to disk: ``output_dir``
    is accepted but never read, and the object handed back is a
    ``llama_cpp.Llama`` instance that is suitable for inference only. For
    training, download the HuggingFace model instead.

    Args:
        gguf_path: Path of the ``.gguf`` file to load.
        output_dir: Unused by the current implementation.

    Returns:
        The loaded ``Llama`` object, or ``None`` when llama-cpp-python cannot
        be imported.
    """
    logger.info(f"Converting GGUF: {gguf_path}")

    try:
        from llama_cpp import Llama
        logger.info("Loading GGUF with llama.cpp...")

        # n_gpu_layers=-1 asks llama.cpp to offload every layer to the GPU.
        gguf_model = Llama(model_path=gguf_path, n_gpu_layers=-1)

        logger.warning("GGUF loading for inference only. For training, use HuggingFace model instead.")
    except ImportError:
        logger.error("llama-cpp-python not installed. Install with: pip install llama-cpp-python")
        logger.info("Alternative: Download Qwen from HuggingFace")
        gguf_model = None

    return gguf_model
|
|
|
|
| |
| |
| |
|
|
def download_qwen_teacher(
    output_dir: str = "models/teacher",
    model_name: str = "Qwen/Qwen2.5-0.5B",
):
    """Download a teacher model and its tokenizer and save both locally.

    Args:
        output_dir: Directory that receives the saved model and tokenizer
            files. It is created (with parents) when missing.
        model_name: Hugging Face Hub id of the teacher checkpoint. Defaults
            to the id this script has always used.

    Returns:
        ``output_dir``, unchanged, so callers can chain on the saved location.
    """
    logger.info("Downloading Qwen teacher model...")

    from transformers import AutoModelForCausalLM, AutoTokenizer

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    logger.info(f"Downloading {model_name}...")
    # The weights are only re-serialized here, so they are loaded without
    # ``device_map="auto"``: nothing needs to live on the (4GB) GPU, and
    # plain CPU loading avoids accelerate's offloading getting in the way of
    # ``save_pretrained``.
    # NOTE(review): trust_remote_code=True lets the model repo run its own
    # Python code. The default Qwen2.5 checkpoint should not need it --
    # confirm and drop it if so.
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
    )
    model.save_pretrained(output_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    tokenizer.save_pretrained(output_dir)

    logger.info(f"✓ Model saved to {output_dir}")
    return output_dir
|
|
|
|
| |
| |
| |
|
|
def _load_wikitext_texts(split: str, max_attempts: int = 4, backoff_seconds: int = 2):
    """Return the ``text`` column of WikiText-2, with fallbacks and retries.

    Several (dataset id, config name) pairs are tried in order. Each pair gets
    up to ``max_attempts`` tries, sleeping ``backoff_seconds * attempt``
    seconds between tries.

    Raises:
        RuntimeError: when every candidate fails; the last underlying error is
            chained as the cause.
    """
    from datasets import DownloadConfig, load_dataset

    wikitext_candidates = [
        ("Salesforce/wikitext", "wikitext-2-raw-v1"),
        ("Salesforce/wikitext", "wikitext-2-v1"),
        ("wikitext", "wikitext-2-raw-v1"),
        ("wikitext", "wikitext-2"),
    ]
    download_config = DownloadConfig(max_retries=8)

    last_error = None
    for dataset_id, config_name in wikitext_candidates:
        for attempt in range(1, max_attempts + 1):
            logger.info(
                "Loading %s (%s), split=%s [attempt %s/%s]",
                dataset_id,
                config_name,
                split,
                attempt,
                max_attempts,
            )
            try:
                dataset_split = load_dataset(
                    dataset_id,
                    config_name,
                    split=split,
                    download_config=download_config,
                )
                return dataset_split["text"]
            # Deliberately broad: any failure should move on to the next
            # attempt or the next candidate source.
            except Exception as exc:
                last_error = exc
                if attempt < max_attempts:
                    sleep_s = backoff_seconds * attempt
                    logger.warning(
                        "Dataset load failed for %s (%s): %s. Retrying in %ss...",
                        dataset_id,
                        config_name,
                        exc,
                        sleep_s,
                    )
                    time.sleep(sleep_s)
                else:
                    # Record the final failure too before moving on to the
                    # next candidate.
                    logger.warning(
                        "Dataset load failed for %s (%s): %s. Giving up on this source.",
                        dataset_id,
                        config_name,
                        exc,
                    )

    raise RuntimeError(
        "Failed to load WikiText after retries/fallbacks. "
        "Please check internet connectivity and Hugging Face availability."
    ) from last_error


def prepare_dataset(dataset_name: str = "wikitext", split: str = "train", output_file: str = "data/train.txt"):
    """Download a text dataset and write its non-blank records to a file.

    Args:
        dataset_name: ``"wikitext"`` or ``"pile"``. Any other value is logged
            as an error.
        split: Dataset split to load. For ``"pile"`` only the first 5000
            records of the split are requested.
        output_file: Destination text file; its parent directory is created
            when missing.

    Returns:
        ``output_file`` on success, or ``None`` for an unknown dataset name.

    Raises:
        RuntimeError: when WikiText cannot be loaded from any candidate source.
    """
    logger.info(f"Preparing dataset: {dataset_name}")

    from datasets import load_dataset

    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Loading {dataset_name}...")
    if dataset_name == "wikitext":
        texts = _load_wikitext_texts(split)
    elif dataset_name == "pile":
        dataset = load_dataset("the_pile", split=f"{split}[:5000]")
        texts = dataset["text"]
    else:
        logger.error(f"Unknown dataset: {dataset_name}")
        return None

    logger.info(f"Writing to {output_file}...")
    # Explicit UTF-8: the corpus contains non-ASCII text, and the platform
    # default encoding is not guaranteed to be able to represent it.
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(text + "\n" for text in texts if text.strip())

    logger.info(f"✓ Dataset saved: {output_file}")
    return output_file
|
|
|
|
| |
| |
| |
|
|
def create_config_template():
    """Write a starter ``config.py`` into the current working directory.

    The generated module defines ``MyConfig``, a ``QwenDistillationConfig``
    subclass whose ``__init__`` sets paths, student size, training,
    distillation and memory options. An existing ``config.py`` is overwritten.
    """
    # Kept flush-left: the literal is written to disk verbatim as Python source.
    template_text = '''
# config.py - Training configuration
from qwen_distill import QwenDistillationConfig

class MyConfig(QwenDistillationConfig):
    def __init__(self):
        super().__init__()

        # Paths
        self.data_file = "data/train.txt"
        self.teacher_model_name = "Qwen/Qwen2.5-0.5B"

        # Student size (adjust based on your needs)
        # Small: 3 layers, 128 hidden = ~30M params
        # Medium: 5 layers, 256 hidden = ~100M params
        # Large: 8 layers, 384 hidden = ~250M params

        self.student_num_layers = 5
        self.student_hidden_dim = 256
        self.student_num_heads = 4

        # Training
        self.batch_size = 2
        self.gradient_accumulation_steps = 4
        self.max_steps = 2000
        self.learning_rate = 8e-4

        # Distillation
        self.temperature = 3.0
        self.alpha = 0.8 # 80% KD loss
        self.beta = 0.2 # 20% feature loss

        # Memory
        self.use_gradient_checkpointing = True
        self.mixed_precision = "fp16"
'''

    Path("config.py").write_text(template_text)

    logger.info("✓ Created config.py template")
|
|
|
|
| |
| |
| |
|
|
def create_train_script():
    """Write a runnable ``train.py`` into the current working directory.

    The generated script:

    * uses ``MyConfig`` from ``config.py`` when that module can be imported,
      and otherwise falls back to a default ``QwenDistillationConfig``, so
      edits made to ``config.py`` actually take effect;
    * reads the training text from the config's ``data_file`` attribute,
      falling back to ``data/train.txt`` when the attribute is absent;
    * builds a ``TextDataset``/``DataLoader`` and calls ``trainer.train``.

    The file is written as UTF-8 because the script text contains a non-ASCII
    character. An existing ``train.py`` is overwritten.
    """
    # Kept flush-left: the literal is written to disk verbatim as Python source.
    train_script = '''#!/usr/bin/env python3
from qwen_distill import QwenDistillationConfig, QwenDistillationTrainer, TextDataset
from torch.utils.data import DataLoader
import torch

# Use the editable settings from config.py when available; otherwise fall
# back to the library defaults.
try:
    from config import MyConfig as ConfigClass
except ImportError:
    print("config.py with MyConfig not found; using QwenDistillationConfig defaults")
    ConfigClass = QwenDistillationConfig

# Load config
config = ConfigClass()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize trainer
trainer = QwenDistillationTrainer(config, device)

# Load data
data_file = getattr(config, "data_file", "data/train.txt")
with open(data_file, "r", encoding="utf-8") as f:
    texts = [line.strip() for line in f if line.strip()]

print(f"Loaded {len(texts)} text samples")

# Create dataset & dataloader
dataset = TextDataset(texts, trainer.tokenizer, max_length=config.max_seq_length)
dataloader = DataLoader(dataset, batch_size=config.batch_size, shuffle=True)

# Train
trainer.train(dataloader)

print("✓ Training complete!")
print("Student saved to: checkpoints/student_final.pt")
'''

    # Python source files are UTF-8 by default, so write the script that way
    # regardless of the platform's locale encoding.
    with open("train.py", "w", encoding="utf-8") as f:
        f.write(train_script)

    logger.info("✓ Created train.py")
|
|
|
|
| |
| |
| |
|
|
def main(argv=None):
    """Parse the command line and run the selected setup steps in order.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Steps run in a fixed order (setup, download, data, config) and ``--all``
    selects every step. When no step is selected the usage text is printed
    instead of exiting silently.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--setup", action="store_true", help="Setup environment")
    parser.add_argument("--download", action="store_true", help="Download teacher")
    parser.add_argument("--data", action="store_true", help="Prepare dataset")
    parser.add_argument("--config", action="store_true", help="Create config.py and train.py")
    parser.add_argument("--all", action="store_true", help="Do all steps")

    args = parser.parse_args(argv)

    # Without any flag there is nothing to do; show the options rather than
    # returning with no output at all.
    if not (args.setup or args.download or args.data or args.config or args.all):
        parser.print_help()
        return

    if args.setup or args.all:
        install_dependencies()

    if args.download or args.all:
        download_qwen_teacher()

    if args.data or args.all:
        prepare_dataset("wikitext", "train", "data/train.txt")

    if args.config or args.all:
        create_config_template()
        create_train_script()

    if args.all:
        logger.info(
            "\n"
            "✓ Setup complete!\n"
            "\n"
            "Next steps:\n"
            "1. Edit config.py to customize settings\n"
            "2. Run: python train.py\n"
            "3. Monitor training in logs/\n"
            "4. Evaluate student model (see eval.py)\n"
        )


if __name__ == "__main__":
    main()
|
|