import json
import os
import shutil

# Pin the GPU and allow vLLM to exceed the config's default max length.
# These must be set before torch/accelerate/vLLM initialize CUDA; setting
# them inside main() (after the module-level Accelerator below has been
# created) would be too late to take effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"

import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.vllm.vllm_model import VLLMModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters
from lighteval.utils.imports import is_package_available

if is_package_available("accelerate"):
    from datetime import timedelta

    from accelerate import Accelerator, InitProcessGroupKwargs

    accelerator = Accelerator(kwargs_handlers=[InitProcessGroupKwargs(timeout=timedelta(seconds=3000))])
else:
    accelerator = None


def merge_lora_if_needed():
    """Merge the LoRA adapter into the base model and add the RoPE scaling config."""
    merged_path = "/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Lora-Merged"

    if os.path.exists(os.path.join(merged_path, "config.json")):
        print(f"Merged model already exists at {merged_path}")
        # Verify that the existing merged model carries the RoPE scaling config
        config_path = os.path.join(merged_path, "config.json")
        with open(config_path, "r") as f:
            config = json.load(f)
        if "rope_scaling" in config:
            print(f"✓ Existing merged model has RoPE scaling: {config['rope_scaling']}")
            print(f"✓ Max position embeddings: {config.get('max_position_embeddings', 'N/A')}")
        else:
            print("⚠ Warning: existing merged model does NOT have a RoPE scaling config!")
            print("  Deleting and re-creating it with RoPE scaling...")
            shutil.rmtree(merged_path)
            return merge_lora_if_needed()  # re-run once to rebuild the merge
        return merged_path

    print("=" * 100)
    print("Merged model not found. Starting merge process...")
    print("=" * 100)

    lora_path = "/public/home/lshi/yoAI/projects/Online_CL/train/model_sft_save/Qwen2.5-Math-1.5B-Lora"

    # Step 1: Load the base model
    print("\n[1/5] Loading base model...")
    base_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-Math-1.5B",
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
    )

    # Step 2: Load the LoRA adapter on top of the base model
    print("\n[2/5] Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, lora_path)

    # Step 3: Merge the adapter weights into the base weights
    print("\n[3/5] Merging LoRA weights with base model...")
    merged_model = model.merge_and_unload()

    # Step 4: Save the merged model
    print(f"\n[4/5] Saving merged model to {merged_path}...")
    os.makedirs(merged_path, exist_ok=True)
    merged_model.save_pretrained(merged_path, safe_serialization=True)

    # Step 5: Add the RoPE scaling configuration
    print("\n[5/5] Adding RoPE scaling configuration...")
    merged_config_path = os.path.join(merged_path, "config.json")
    with open(merged_config_path, "r") as f:
        merged_config = json.load(f)

    # RoPE scaling: extend the context window 4096 -> 8192 (factor = 2.0)
    merged_config["rope_scaling"] = {
        "type": "linear",
        "factor": 2.0,
    }
    print(f"✓ Added RoPE scaling: {merged_config['rope_scaling']}")

    # Update max_position_embeddings from 4096 to 8192 to match the scaling factor
    original_max_pos = merged_config.get("max_position_embeddings", 4096)
    scaling_factor = merged_config["rope_scaling"]["factor"]
    new_max_pos = int(original_max_pos * scaling_factor)
    merged_config["max_position_embeddings"] = new_max_pos
    print(f"✓ Updated max_position_embeddings: {original_max_pos} -> {new_max_pos}")

    # Save the updated config
    with open(merged_config_path, "w") as f:
        json.dump(merged_config, f, indent=2, ensure_ascii=False)

    # Save the tokenizer alongside the merged weights
    print("Saving tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B", trust_remote_code=True)
    tokenizer.save_pretrained(merged_path)

    # Free GPU memory before vLLM starts
    del base_model
    del model
    del merged_model
    torch.cuda.empty_cache()

    print("\n" + "=" * 100)
    print("✓ Merge completed successfully!")
    print(f"✓ Merged model saved to: {merged_path}")
    print(f"✓ RoPE scaling config: {merged_config['rope_scaling']}")
    print(f"✓ Max position embeddings: {merged_config['max_position_embeddings']}")
    print("=" * 100 + "\n")

    return merged_path


def main():
    print("Checking for merged model...")
    merged_model_path = merge_lora_if_needed()

    # Report the visible GPUs (CUDA_VISIBLE_DEVICES above restricts this to one)
    num_gpus = torch.cuda.device_count()
    print(f"\n{'=' * 100}")
    print(f"Detected {num_gpus} GPU(s)")
    for i in range(num_gpus):
        print(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
    print(f"{'=' * 100}\n")

    # Read the merged model config to report the context-length settings
    config_path = os.path.join(merged_model_path, "config.json")
    with open(config_path, "r") as f:
        model_config_dict = json.load(f)

    max_position_embeddings = model_config_dict.get("max_position_embeddings", 4096)
    rope_scaling = model_config_dict.get("rope_scaling", None)

    print(f"Model max_position_embeddings: {max_position_embeddings}")
    print(f"Model RoPE scaling config: {rope_scaling}")

    # Use 8192 as max_model_length (the extended context length used during
    # training); this matches the max_position_embeddings written by the merge.
    max_model_length = 8192
    print(f"Using max_model_length: {max_model_length}\n")

    print("Setting up evaluation pipeline...")
    evaluation_tracker = EvaluationTracker(
        output_dir="./results",
        save_details=True,
        push_to_hub=False,
    )

    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.ACCELERATE,
        custom_tasks_directory=None,
        max_samples=500,
    )

    model_config = VLLMModelConfig(
        model_name=merged_model_path,
        dtype="bfloat16",
        max_model_length=max_model_length,  # 8192, matching the scaled config
        trust_remote_code=True,
        tensor_parallel_size=num_gpus,
    )

    task = "lighteval|math_500|0"

    print(f"Using {num_gpus} GPU(s) with tensor parallelism")
    print(f"Task: {task}")
    print(f"Max model length: {max_model_length}\n")

    print("Creating pipeline...")
    pipeline = Pipeline(
        tasks=task,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        model_config=model_config,
    )

    # Cap generation_size on every doc so completions stay inside the context window
    print("Configuring generation parameters...")
    for task_obj in pipeline.tasks_dict.values():
        for doc in task_obj._docs:
            doc.generation_size = 2048

    print("\nStarting evaluation...")
    print("=" * 100)
    pipeline.evaluate()

    print("\nSaving results...")
    pipeline.save_and_push_results()

    print("\nShowing results...")
    pipeline.show_results()

    print("\n" + "=" * 100)
    print("✓ Evaluation completed!")
    print("=" * 100)


if __name__ == "__main__":
    main()
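# Optional sanity check (a sketch, separate from the pipeline above): after the
# merge, round-trip the patched config through transformers to confirm that the
# extended context window is what vLLM will see. Run it in a Python shell with
# `merged_path` pointing at the merged-model directory used above; note that
# recent transformers versions may normalize the "type" key to "rope_type".
#
#   from transformers import AutoConfig
#   cfg = AutoConfig.from_pretrained(merged_path, trust_remote_code=True)
#   print(cfg.rope_scaling)             # expect the linear scaling dict (factor 2.0)
#   print(cfg.max_position_embeddings)  # expect 8192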