#!/usr/bin/env python3
import os
import yaml
import sys
def _int_env(name, default):
    # Read an integer environment variable, falling back to *default*
    # when the variable is unset or not a valid integer.
    try:
        return int(os.getenv(name, default))
    except (ValueError, TypeError):
        return default


def _extract_hostnames(path):
    # Collect the value of every "Hostname <value>" line in an SSH config.
    # Lines with no value after the keyword are skipped (previously IndexError).
    hostnames = []
    with open(path, "r") as f:
        for line in f:
            line = line.strip()
            if line.startswith("Hostname"):
                parts = line.split(None, 1)
                if len(parts) == 2:
                    hostnames.append(parts[1])
    return hostnames


def _write_hostfile(path, hostnames, slots=8):
    # Write a DeepSpeed hostfile: one "<host> slots=<n>" entry per line.
    # NOTE(review): slots is hard-coded to 8 here, independent of
    # SENSECORE_ACCELERATE_DEVICE_COUNT — confirm this matches the cluster.
    with open(path, "w") as f:
        for host in hostnames:
            f.write(host + f" slots={slots}\n")


def main():
    """Generate an Accelerate/DeepSpeed YAML config and a DeepSpeed hostfile.

    Cluster topology is read from environment variables with safe defaults
    (single node, single GPU, rank 0), so the script also works for local
    runs. Returns 0 so the value can be used directly as an exit code.
    """
    # Rendezvous endpoint; defaults allow a single-node local run.
    master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
    master_port = _int_env("MASTER_PORT", 29500)

    # Node/GPU topology from the SenseCore scheduler, guarded against
    # unset or malformed values.
    num_nodes = _int_env("SENSECORE_PYTORCH_NNODES", 1)
    gpus_per_node = _int_env("SENSECORE_ACCELERATE_DEVICE_COUNT", 1)
    node_rank = _int_env("SENSECORE_PYTORCH_NODE_RANK", 0)

    # Total world size across all nodes.
    num_processes = num_nodes * gpus_per_node

    # Accelerate launcher configuration (DeepSpeed backend).
    config = {
        "compute_environment": "LOCAL_MACHINE",
        "distributed_type": "DEEPSPEED",
        "deepspeed_config": {
            "deepspeed_config_file": "configs/ds_config.json",
            "zero3_init_flag": True,
            "deepspeed_multinode_launcher": "standard",
            "deepspeed_hostfile": '/mnt/jfzn/msj/flash-linear-attention/legacy/training/hostfile.txt',
        },
        "machine_rank": node_rank,
        "main_process_ip": master_addr,
        "main_process_port": master_port,
        "main_training_function": "main",
        "num_machines": num_nodes,
        "num_processes": num_processes,
        "same_network": True,
        "use_cpu": False,
        "rdzv_backend": "c10d",
        "tpu_env": [],
        "tpu_use_cluster": False,
        "tpu_use_sudo": False,
    }

    # Summarize the derived configuration.
    print("Generated Configuration:")
    print(f"  Master: {master_addr}:{master_port}")
    print(f"  Number of nodes: {num_nodes}")
    print(f"  GPUs per node: {gpus_per_node}")
    print(f"  Total processes: {num_processes}")
    print(f"  Node rank: {node_rank}")

    # Write the YAML to its absolute destination. Ensure the *actual*
    # output directory exists (previously only a relative "configs" dir
    # was created, which did not cover this absolute path).
    output_file = "/mnt/jfzn/msj/flash-linear-attention/legacy/training/configs/deepspeed_sencore.yaml"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w") as f:
        yaml.dump(config, f, default_flow_style=False)
    print(f"\nConfiguration saved to: {output_file}")

    # Echo the written file for inspection.
    print("\nFile content:")
    with open(output_file, "r") as f:
        print(f.read())

    # Derive the DeepSpeed hostfile from the SSH client config.
    input_file = '/mnt/jfzn/msj/flash-linear-attention/legacy/training/ssh_config/config'
    output_file = '/mnt/jfzn/msj/flash-linear-attention/legacy/training/hostfile.txt'
    hostnames = _extract_hostnames(input_file)
    _write_hostfile(output_file, hostnames)
    print(f"提取了 {len(hostnames)} 个 hostname,已写入 {output_file}")

    return 0
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())