Megatron-LM / tasks /model_loading_example.py

Upload folder using huggingface_hub

88e6849 verified about 1 month ago

10 kB

	#!/usr/bin/env python3
	"""
	示例：手动重现 lm-evaluation-harness 的模型加载流程

	这个脚本展示了 lm-evaluation-harness 如何从 model_args 中读取配置并构建模型。
	你可以运行这个脚本来理解整个流程。
	"""

	import sys
	import os

	# 添加 lm-evaluation-harness 到路径
	sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../../lm-evaluation-harness'))

	import torch
	import transformers
	from lm_eval import utils
	from lm_eval.api.registry import get_model


	def step1_parse_args_string():
	    """Step 1: parse a ``--model_args`` style string.

	    Prints a fixed example string and the result of passing it through
	    ``utils.simple_parse_args_string``, then returns that parsed result.
	    """
	    banner = "=" * 80
	    print(banner, "步骤1: 解析 model_args 字符串", banner, sep="\n")

	    # Stand-in for the command-line flag:
	    #   --model_args "pretrained=EleutherAI/gpt-j-6B,dtype=float,revision=main"
	    raw_model_args = "pretrained=EleutherAI/gpt-j-6B,dtype=float,revision=main"
	    print(f"输入字符串: {raw_model_args}")

	    # Delegate to lm-eval's own parsing helper.
	    args_mapping = utils.simple_parse_args_string(raw_model_args)
	    print(f"解析后的字典: {args_mapping}")
	    print()

	    return args_mapping


	def step2_get_model_class():
	    """Step 2: look up the model class registered under the name ``"hf"``.

	    Prints the lookup name, the class returned by ``get_model`` and that
	    class's ``__name__``, then returns the class.
	    """
	    separator = "=" * 80
	    print(separator, "步骤2: 获取模型类", separator, sep="\n")

	    # Stand-in for the command-line flag: --model hf
	    registry_key = "hf"
	    print(f"模型名称: {registry_key}")

	    # Resolve the class through the lm-eval registry.
	    resolved_cls = get_model(registry_key)
	    print(f"获取到的模型类: {resolved_cls}")
	    print(f"类名: {resolved_cls.__name__}")
	    print()

	    return resolved_cls


	def step3_load_config(pretrained, revision="main", trust_remote_code=False):
	    """Step 3: load the model configuration via ``transformers.AutoConfig``.

	    Args:
	        pretrained: Model name or path passed to ``from_pretrained``.
	        revision: Revision passed to ``from_pretrained``.
	        trust_remote_code: Forwarded unchanged to ``from_pretrained``.

	    Returns:
	        The config object returned by ``AutoConfig.from_pretrained``.
	    """
	    rule = "=" * 80
	    print(rule, "步骤3: 加载模型配置", rule, sep="\n")

	    print(f"模型路径/名称: {pretrained}")
	    print(f"Revision: {revision}")
	    print(f"Trust remote code: {trust_remote_code}")

	    # The original author notes this mirrors what lm-eval does in
	    # _get_config().
	    load_kwargs = {"revision": revision, "trust_remote_code": trust_remote_code}
	    cfg = transformers.AutoConfig.from_pretrained(pretrained, **load_kwargs)

	    def _pick(primary, fallback):
	        # Prefer the `primary` attribute, then `fallback`, else 'N/A'.
	        return getattr(cfg, primary, getattr(cfg, fallback, 'N/A'))

	    print(f"\n配置类型: {type(cfg)}")
	    print(f"模型类型: {cfg.model_type}")
	    print(f"词汇表大小: {cfg.vocab_size}")
	    for label, primary, fallback in (
	        ("隐藏层大小", "n_embd", "hidden_size"),
	        ("层数", "n_layer", "num_hidden_layers"),
	        ("注意力头数", "n_head", "num_attention_heads"),
	    ):
	        print(f"{label}: {_pick(primary, fallback)}")

	    # Report where the configuration came from, when the config records it.
	    if hasattr(cfg, '_name_or_path'):
	        print(f"\n配置来源: {cfg._name_or_path}")

	    print()
	    return cfg


	def step4_load_tokenizer(pretrained, revision="main", trust_remote_code=False):
	    """Step 4: load the tokenizer via ``transformers.AutoTokenizer``.

	    Args:
	        pretrained: Model name or path passed to ``from_pretrained``.
	        revision: Revision passed to ``from_pretrained``.
	        trust_remote_code: Forwarded unchanged to ``from_pretrained``.

	    Returns:
	        The tokenizer returned by ``AutoTokenizer.from_pretrained`` (called
	        with ``use_fast=True``).
	    """
	    rule = "=" * 80
	    print(rule, "步骤4: 加载 Tokenizer", rule, sep="\n")

	    print(f"模型路径/名称: {pretrained}")

	    # The original author notes this mirrors what lm-eval does in
	    # _create_tokenizer().
	    load_kwargs = {
	        "revision": revision,
	        "trust_remote_code": trust_remote_code,
	        "use_fast": True,
	    }
	    tok = transformers.AutoTokenizer.from_pretrained(pretrained, **load_kwargs)

	    print(f"Tokenizer 类型: {type(tok)}")
	    print(f"词汇表大小: {tok.vocab_size}")
	    print("特殊 token:")
	    for tag in ("bos", "eos", "pad"):
	        token_text = getattr(tok, f"{tag}_token")
	        token_id = getattr(tok, f"{tag}_token_id")
	        print(f" - {tag.upper()}: {token_text} (ID: {token_id})")

	    print()
	    return tok


	def step5_load_model(pretrained, config, dtype="float", revision="main", trust_remote_code=False):
	    """Step 5: load the model weights with ``from_pretrained``.

	    Args:
	        pretrained: Model name or path passed to ``from_pretrained``.
	        config: Previously loaded config; only ``config.model_type`` is read,
	            to choose between the causal-LM and seq2seq auto classes.
	        dtype: ``"auto"`` (forwarded unchanged as ``torch_dtype``), a
	            ``torch.dtype`` object, or the name of a ``torch`` dtype
	            attribute such as ``"float"``, ``"float16"`` or ``"bfloat16"``.
	            Anything else falls back to ``torch.float32`` after a warning
	            is printed.
	        revision: Revision passed to ``from_pretrained``.
	        trust_remote_code: Forwarded unchanged to ``from_pretrained``.

	    Returns:
	        The loaded model, or ``None`` when loading or reporting raised an
	        exception (the error is printed rather than re-raised).
	    """
	    print("=" * 80)
	    print("步骤5: 加载模型")
	    print("=" * 80)

	    print(f"模型路径/名称: {pretrained}")
	    print(f"数据类型: {dtype}")

	    # Resolve the dtype argument. Names are looked up on the torch module so
	    # that any valid dtype name works (not just three hard-coded ones), and
	    # "auto" is passed through for from_pretrained to interpret.
	    if isinstance(dtype, str) and dtype == "auto":
	        torch_dtype = "auto"
	    elif isinstance(dtype, torch.dtype):
	        torch_dtype = dtype
	    else:
	        resolved = getattr(torch, dtype, None) if isinstance(dtype, str) else None
	        if isinstance(resolved, torch.dtype):
	            torch_dtype = resolved
	        else:
	            # Keep the historical float32 fallback, but no longer silently.
	            print(f"警告: 无法识别的 dtype '{dtype}'，回退到 torch.float32")
	            torch_dtype = torch.float32

	    print(f"Torch dtype: {torch_dtype}")

	    # Choose the auto class from the config's model type. The original
	    # author notes lm-eval does this in _get_backend() and _create_model().
	    model_type = config.model_type

	    if model_type in ["gpt2", "gptj", "gpt_neox", "llama", "mistral", "mixtral"]:
	        model_class = transformers.AutoModelForCausalLM
	        print(f"使用模型类: AutoModelForCausalLM (因果语言模型)")
	    elif model_type in ["t5", "bart"]:
	        model_class = transformers.AutoModelForSeq2SeqLM
	        print(f"使用模型类: AutoModelForSeq2SeqLM (序列到序列模型)")
	    else:
	        # Unknown model types default to the causal-LM auto class.
	        model_class = transformers.AutoModelForCausalLM
	        print(f"使用默认模型类: AutoModelForCausalLM")

	    print(f"\n开始加载模型（这可能需要一些时间）...")

	    # Best-effort load: any failure is reported and turned into None so the
	    # rest of the demo can continue.
	    try:
	        model = model_class.from_pretrained(
	            pretrained,
	            revision=revision,
	            torch_dtype=torch_dtype,
	            trust_remote_code=trust_remote_code,
	            device_map="auto",  # let the library place the weights on devices
	        )

	        print(f"模型类型: {type(model)}")
	        print(f"模型参数数量: {sum(p.numel() for p in model.parameters()):,}")
	        print(f"可训练参数数量: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

	        # Check the device map first: testing `device` first made the
	        # device-map branch unreachable, because the model also has `device`.
	        device_map = getattr(model, 'hf_device_map', None)
	        if device_map:
	            print(f"模型设备映射: {device_map}")
	        elif hasattr(model, 'device'):
	            print(f"模型设备: {model.device}")

	    except Exception as e:
	        print(f"加载模型时出错: {e}")
	        print("注意: 这可能是由于内存不足或模型文件不存在")
	        model = None

	    print()
	    return model


	def step6_create_hflm_instance():
	    """Step 6: show how lm-eval would build a complete model instance.

	    Looks up the ``"hf"`` model class via ``get_model`` and prints the
	    ``create_from_arg_string`` call that would construct it. No model is
	    actually instantiated here.
	    """
	    rule = "=" * 80
	    print(rule, "步骤6: 使用 lm-eval 创建完整的 HFLM 实例", rule, sep="\n")

	    # Stand-ins for the full command-line arguments.
	    registry_key = "hf"
	    arg_string = "pretrained=EleutherAI/gpt-j-6B,dtype=float"

	    print(f"模型名称: {registry_key}")
	    print(f"模型参数: {arg_string}")

	    # Resolve the class through the lm-eval registry.
	    hflm_cls = get_model(registry_key)

	    # The original author notes create_from_arg_string is the method
	    # lm-eval actually uses.
	    print("\n使用 lm-eval 的 create_from_arg_string 创建模型实例...")

	    try:
	        # Only the call is displayed; building a real model would need
	        # considerable time and memory.
	        print("（跳过实际模型加载，因为这需要大量内存和时间）")
	        print("实际代码:")
	        for snippet_line in (
	            f" lm = {hflm_cls.__name__}.create_from_arg_string(",
	            f" '{arg_string}',",
	            " {'batch_size': 8, 'device': 'cuda'}",
	            " )",
	        ):
	            print(snippet_line)
	    except Exception as err:
	        print(f"创建实例时出错: {err}")

	    print()


	def main(load_full_model=False):
	    """Run the complete model-loading walkthrough, step by step.

	    Steps 1 and 2 run unconditionally. Steps 3 to 6 run inside a single
	    ``try`` block: any exception is printed with a traceback and the
	    closing summary is still shown.

	    Args:
	        load_full_model: When true, step 5 really loads the model weights
	            via ``step5_load_model``. Defaults to ``False`` (the value that
	            used to be hard-coded), in which case the step is skipped with
	            a message.
	    """
	    print("\n" + "=" * 80)
	    print("lm-evaluation-harness 模型加载流程演示")
	    print("=" * 80 + "\n")

	    # Step 1: parse the arguments.
	    parsed_args = step1_parse_args_string()

	    # Step 2: look up the model class (called for its printed output only).
	    step2_get_model_class()

	    # Values for steps 3-5, with defaults for anything missing.
	    pretrained = parsed_args.get("pretrained", "EleutherAI/gpt-j-6B")
	    revision = parsed_args.get("revision", "main")
	    trust_remote_code = parsed_args.get("trust_remote_code", False)

	    print("\n注意: 以下步骤需要网络连接来下载模型文件")
	    print("如果模型已缓存，将使用缓存版本\n")

	    try:
	        # Step 3: load the configuration.
	        config = step3_load_config(pretrained, revision, trust_remote_code)

	        # Step 4: load the tokenizer (called for its printed output only).
	        step4_load_tokenizer(pretrained, revision, trust_remote_code)

	        # Step 5: load the model (optional).
	        dtype = parsed_args.get("dtype", "float")

	        print("\n是否加载完整模型？这需要大量内存和时间。")
	        print("（在实际使用中，lm-eval 会自动执行这一步）")

	        if load_full_model:
	            step5_load_model(pretrained, config, dtype, revision, trust_remote_code)
	        else:
	            print("跳过完整模型加载（设置 load_full_model=True 来实际加载）")
	            print()

	        # Step 6: show how lm-eval would create the instance.
	        step6_create_hflm_instance()

	    except Exception as e:
	        # Top-level boundary of the demo: report the error and carry on to
	        # the summary instead of crashing.
	        print(f"\n错误: {e}")
	        print("这可能是因为:")
	        print("1. 网络连接问题（无法下载模型）")
	        print("2. 模型名称不存在")
	        print("3. 缺少必要的依赖")
	        import traceback
	        traceback.print_exc()

	    print("\n" + "=" * 80)
	    print("演示完成！")
	    print("=" * 80)
	    print("\n关键要点:")
	    print("1. model_args 字符串被解析为字典")
	    print("2. 根据 --model 参数获取对应的模型类（如 HFLM）")
	    print("3. 使用 AutoConfig.from_pretrained() 加载 config.json")
	    print("4. 使用 AutoTokenizer.from_pretrained() 加载 tokenizer")
	    print("5. 使用 AutoModelForCausalLM.from_pretrained() 加载模型权重")
	    print("\n详细说明请参考: LM_EVAL_MODEL_LOADING.md")


	# Script entry point: run the walkthrough only when this file is executed
	# directly, not when it is imported.
	if __name__ == "__main__":
	main()