msj19
/

mask_gdn_hrr2

Model card Files Files and versions

mask_gdn_hrr2 / create_yaml.py

msj19's picture

Add files using upload-large-folder tool

51a9d97 verified 3 months ago

history blame contribute delete

3.67 kB

	#!/usr/bin/env python3
	import os
	import yaml
	import sys

	def _env_int(name, default):
	    """Return environment variable *name* as an int, or *default*.

	    *default* is returned when the variable is unset or when its value
	    cannot be parsed as an integer; the latter case is reported on stderr
	    so a misconfigured launcher does not go unnoticed.
	    """
	    raw = os.getenv(name)
	    if raw is None:
	        return default
	    try:
	        return int(raw)
	    except ValueError:
	        print(
	            f"Warning: {name}={raw!r} is not an integer; using {default}",
	            file=sys.stderr,
	        )
	        return default


	def _parse_ssh_hostnames(lines):
	    """Return the values of every ``HostName`` directive found in *lines*.

	    The keyword is matched case-insensitively and both the
	    ``keyword value`` and ``keyword=value`` spellings are accepted.
	    Blank lines, comments and directives without a value are skipped.
	    """
	    hostnames = []
	    for raw_line in lines:
	        line = raw_line.strip()
	        if not line or line.startswith("#"):
	            continue
	        # Normalise "keyword=value" to "keyword value" before splitting.
	        fields = line.replace("=", " ", 1).split(None, 1)
	        if len(fields) < 2 or fields[0].lower() != "hostname":
	            continue
	        host = fields[1].strip().strip('"')
	        if host:
	            hostnames.append(host)
	    return hostnames


	def main():
	    """Write the DeepSpeed launch config (YAML) and the matching hostfile.

	    The rendezvous address/port and the cluster topology are read from the
	    environment variables ``MASTER_ADDR``, ``MASTER_PORT``,
	    ``SENSECORE_PYTORCH_NNODES``, ``SENSECORE_ACCELERATE_DEVICE_COUNT`` and
	    ``SENSECORE_PYTORCH_NODE_RANK``; each falls back to a default when
	    unset or malformed. The resulting config is dumped as YAML and echoed
	    to stdout, then the hostfile is rebuilt from the ``Hostname`` entries
	    of the SSH config, one ``<host> slots=8`` line per entry.

	    Returns:
	        0, for use as the process exit status.

	    Raises:
	        OSError: if the SSH config cannot be read or an output file cannot
	            be written.
	    """
	    training_dir = "/mnt/jfzn/msj/flash-linear-attention/legacy/training"
	    config_path = training_dir + "/configs/deepspeed_sencore.yaml"
	    ssh_config_path = training_dir + "/ssh_config/config"
	    hostfile_path = training_dir + "/hostfile.txt"
	    # NOTE(review): the slot count is fixed and independent of
	    # SENSECORE_ACCELERATE_DEVICE_COUNT -- confirm this is intended.
	    slots_per_host = 8

	    # Read values from the environment, falling back to defaults.
	    master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
	    master_port = _env_int("MASTER_PORT", 29500)
	    num_nodes = _env_int("SENSECORE_PYTORCH_NNODES", 1)
	    gpus_per_node = _env_int("SENSECORE_ACCELERATE_DEVICE_COUNT", 1)
	    node_rank = _env_int("SENSECORE_PYTORCH_NODE_RANK", 0)

	    # Total number of processes across all nodes.
	    num_processes = num_nodes * gpus_per_node

	    config = {
	        "compute_environment": "LOCAL_MACHINE",
	        "distributed_type": "DEEPSPEED",
	        "deepspeed_config": {
	            "deepspeed_config_file": "configs/ds_config.json",
	            "zero3_init_flag": True,
	            "deepspeed_multinode_launcher": "standard",
	            "deepspeed_hostfile": hostfile_path,
	        },
	        "machine_rank": node_rank,
	        "main_process_ip": master_addr,
	        "main_process_port": master_port,
	        "main_training_function": "main",
	        "num_machines": num_nodes,
	        "num_processes": num_processes,
	        "same_network": True,
	        "use_cpu": False,
	        "rdzv_backend": "c10d",
	        "tpu_env": [],
	        "tpu_use_cluster": False,
	        "tpu_use_sudo": False,
	    }

	    print("Generated Configuration:")
	    print(f" Master: {master_addr}:{master_port}")
	    print(f" Number of nodes: {num_nodes}")
	    print(f" GPUs per node: {gpus_per_node}")
	    print(f" Total processes: {num_processes}")
	    print(f" Node rank: {node_rank}")

	    # Create the directory the YAML is actually written to, not a
	    # "configs" directory relative to the current working directory.
	    os.makedirs(os.path.dirname(config_path), exist_ok=True)

	    with open(config_path, "w", encoding="utf-8") as f:
	        yaml.dump(config, f, default_flow_style=False)

	    print(f"\nConfiguration saved to: {config_path}")

	    # Echo the written file so it can be checked in the job log.
	    print("\nFile content:")
	    with open(config_path, "r", encoding="utf-8") as f:
	        print(f.read())

	    # Rebuild the hostfile from the SSH config.
	    with open(ssh_config_path, "r", encoding="utf-8") as f:
	        hostnames = _parse_ssh_hostnames(f)

	    if not hostnames:
	        print(
	            f"Warning: no Hostname entries found in {ssh_config_path}",
	            file=sys.stderr,
	        )

	    # One "<hostname> slots=N" line per host.
	    with open(hostfile_path, "w", encoding="utf-8") as f:
	        f.writelines(f"{host} slots={slots_per_host}\n" for host in hostnames)

	    print(f"提取了 {len(hostnames)} 个 hostname，已写入 {hostfile_path}")
	    return 0

	# Script entry point: run main() and use its return value as the exit status.
	if __name__ == "__main__":
	    raise SystemExit(main())