#!/usr/bin/env python3
"""LoRA merge + AWQ quantization script.

Run after training completes:
1. Merge the LoRA adapter into the base model.
2. AWQ-quantize the merged model (4-bit).
3. Upload the quantized model to the HuggingFace Hub.
"""

import os
import sys
import shutil
from datetime import datetime

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from awq import AutoAWQForCausalLM

# ============================================================
# Configuration
# ============================================================
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_MODEL = "hajimemat/qwen2.5-7b-glaive-fc-lora"  # trained LoRA adapter

# Output locations
MERGED_MODEL_DIR = "./merged_model"
QUANTIZED_MODEL_DIR = "./quantized_model"
OUTPUT_MODEL_ID = "hajimemat/qwen2.5-7b-glaive-fc-awq"

# AWQ quantization settings (4-bit, GEMM kernel, 128-wide groups)
AWQ_CONFIG = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM",
}


def step1_merge_lora():
    """Step 1: merge the LoRA adapter into the base model.

    Loads BASE_MODEL and the LORA_MODEL adapter, merges the adapter
    weights into the base weights, and saves model + tokenizer to
    MERGED_MODEL_DIR.

    Returns:
        str: path to the directory containing the merged model
             (MERGED_MODEL_DIR).
    """
    print("\n" + "=" * 60)
    print("Step 1: Merging LoRA adapter to base model")
    print("=" * 60)
    print(f"Base model: {BASE_MODEL}")
    print(f"LoRA model: {LORA_MODEL}")

    # Load the base model
    print("\nLoading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
    )

    # Load the tokenizer
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    # Apply the LoRA adapter on top of the base model
    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, LORA_MODEL)

    # Fold the adapter weights into the base weights
    print("Merging LoRA weights...")
    model = model.merge_and_unload()

    # Save merged model + tokenizer
    print(f"Saving merged model to {MERGED_MODEL_DIR}...")
    model.save_pretrained(MERGED_MODEL_DIR, safe_serialization=True)
    tokenizer.save_pretrained(MERGED_MODEL_DIR)

    # Free GPU memory before the quantization step
    del model
    del base_model
    torch.cuda.empty_cache()

    print("✅ Step 1 complete: LoRA merged")
    return MERGED_MODEL_DIR


def step2_quantize_awq(merged_model_path):
    """Step 2: AWQ-quantize the merged model to 4-bit.

    Args:
        merged_model_path (str): directory containing the merged
            full-precision model produced by step 1.

    Returns:
        str: path to the directory containing the quantized model
             (QUANTIZED_MODEL_DIR).
    """
    print("\n" + "=" * 60)
    print("Step 2: AWQ Quantization (4-bit)")
    print("=" * 60)
    print(f"Input model: {merged_model_path}")
    print(f"AWQ config: {AWQ_CONFIG}")

    # Load the merged model for quantization
    print("\nLoading merged model for quantization...")
    model = AutoAWQForCausalLM.from_pretrained(
        merged_model_path,
        trust_remote_code=True,
        safetensors=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(merged_model_path)

    # Calibration data for quantization.
    # A handful of simple samples is sufficient here.
    calib_data = [
        "Hello, how are you today?",
        "What is the weather like?",
        "Can you help me with coding?",
        "Search for files containing docker",
        "List all repositories in the project",
        "Find the definition of the function main",
        "こんにちは、今日の天気はどうですか?",
        "プロジェクトのファイル構成を教えてください",
    ]

    # Run quantization
    print("\nQuantizing model (this may take a while)...")
    model.quantize(
        tokenizer,
        quant_config=AWQ_CONFIG,
        calib_data=calib_data,
    )

    # Save quantized model + tokenizer
    print(f"\nSaving quantized model to {QUANTIZED_MODEL_DIR}...")
    model.save_quantized(QUANTIZED_MODEL_DIR, safetensors=True)
    tokenizer.save_pretrained(QUANTIZED_MODEL_DIR)

    print("✅ Step 2 complete: AWQ quantization done")
    return QUANTIZED_MODEL_DIR


def step3_upload_to_hub(quantized_model_path):
    """Step 3: upload the quantized model to the HuggingFace Hub.

    Creates the (private) OUTPUT_MODEL_ID repository if it does not
    already exist, then uploads the whole model folder.

    Args:
        quantized_model_path (str): directory containing the quantized
            model produced by step 2.
    """
    print("\n" + "=" * 60)
    print("Step 3: Upload to HuggingFace Hub")
    print("=" * 60)
    print(f"Uploading to: {OUTPUT_MODEL_ID}")

    from huggingface_hub import HfApi, upload_folder

    api = HfApi()

    # Create the repository if it does not exist yet.
    # exist_ok=True already tolerates an existing repo; the try/except
    # keeps the pipeline going on other non-fatal repo-creation errors.
    try:
        api.create_repo(OUTPUT_MODEL_ID, private=True, exist_ok=True)
    except Exception as e:
        print(f"Note: {e}")

    # Upload the model folder
    print("Uploading files...")
    upload_folder(
        folder_path=quantized_model_path,
        repo_id=OUTPUT_MODEL_ID,
        repo_type="model",
    )

    print(f"✅ Step 3 complete: Uploaded to https://huggingface.co/{OUTPUT_MODEL_ID}")


def main():
    """Run the full merge -> quantize -> upload pipeline."""
    print("\n" + "=" * 70)
    print(" LoRA Merge + AWQ Quantization Pipeline")
    print("=" * 70)
    print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print(f"Base: {BASE_MODEL}")
    print(f"LoRA: {LORA_MODEL}")
    print(f"Output: {OUTPUT_MODEL_ID}")
    print("=" * 70)

    # GPU availability check (informational only; the pipeline still runs on CPU)
    if torch.cuda.is_available():
        print(f"\nGPU: {torch.cuda.get_device_name(0)}")
        print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    else:
        print("WARNING: No GPU available, this will be slow!")

    # Step 1: merge LoRA into the base model
    merged_path = step1_merge_lora()

    # Step 2: quantize the merged model
    quantized_path = step2_quantize_awq(merged_path)

    # Step 3: upload to the Hub
    step3_upload_to_hub(quantized_path)

    # Optional cleanup of intermediate artifacts
    print("\n" + "=" * 60)
    print("Cleanup")
    print("=" * 60)
    cleanup = input("Delete intermediate files? (merged_model/) [y/N]: ").strip().lower()
    if cleanup == 'y':
        shutil.rmtree(MERGED_MODEL_DIR, ignore_errors=True)
        print("Cleaned up merged_model/")

    print("\n" + "=" * 70)
    print("🎉 Pipeline complete!")
    print(f"Model available at: https://huggingface.co/{OUTPUT_MODEL_ID}")
    print("=" * 70)


if __name__ == "__main__":
    main()