#!/usr/bin/env python3
"""Generate a distributed-launch YAML config and a hostfile for a training job.

The script performs two steps:

1. Reads the cluster topology from environment variables (``MASTER_ADDR``,
   ``MASTER_PORT``, ``SENSECORE_PYTORCH_NNODES``,
   ``SENSECORE_ACCELERATE_DEVICE_COUNT``, ``SENSECORE_PYTORCH_NODE_RANK``)
   and writes a YAML launch configuration (the keys look like a Hugging Face
   Accelerate + DeepSpeed config -- confirm against the launcher in use).
2. Extracts every ``HostName`` entry from an SSH config file and writes a
   hostfile with one ``<hostname> slots=<N>`` line per entry.
"""
import os
import sys

import yaml

# All generated and consumed files live under this directory.
TRAINING_DIR = "/mnt/jfzn/msj/flash-linear-attention/legacy/training"
HOSTFILE_PATH = os.path.join(TRAINING_DIR, "hostfile.txt")
ACCELERATE_CONFIG_PATH = os.path.join(
    TRAINING_DIR, "configs", "deepspeed_sencore.yaml"
)
SSH_CONFIG_PATH = os.path.join(TRAINING_DIR, "ssh_config", "config")

# NOTE(review): the slot count is fixed and independent of
# SENSECORE_ACCELERATE_DEVICE_COUNT -- confirm that every host really
# exposes 8 devices before relying on the generated hostfile.
SLOTS_PER_HOST = 8


def _env_int(name: str, default: int) -> int:
    """Return environment variable *name* as an int, or *default*.

    The default is used when the variable is unset or is not a valid
    integer; in the latter case a warning is printed to stderr so that a
    misconfigured environment does not go unnoticed.
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        return int(raw)
    except ValueError:
        print(
            f"Warning: {name}={raw!r} is not an integer; using {default}",
            file=sys.stderr,
        )
        return default


def _parse_ssh_hostnames(lines) -> list:
    """Return the values of all ``HostName`` entries found in *lines*.

    *lines* is any iterable of SSH-config text lines. The keyword is
    matched case-insensitively as a whole word, and entries without a
    value are skipped instead of raising ``IndexError``. Only the
    whitespace-separated ``keyword value`` form is recognised.
    """
    hostnames = []
    for line in lines:
        parts = line.strip().split(None, 1)
        if len(parts) == 2 and parts[0].lower() == "hostname":
            hostnames.append(parts[1])
    return hostnames


def main() -> int:
    """Write the launch YAML config and the hostfile; return exit code 0.

    Raises:
        OSError: if the SSH config cannot be read or an output file cannot
            be written.
    """
    # Read the topology from the environment, falling back to single-node
    # defaults when a variable is missing or malformed.
    master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
    master_port = _env_int("MASTER_PORT", 29500)
    num_nodes = _env_int("SENSECORE_PYTORCH_NNODES", 1)
    gpus_per_node = _env_int("SENSECORE_ACCELERATE_DEVICE_COUNT", 1)
    node_rank = _env_int("SENSECORE_PYTORCH_NODE_RANK", 0)

    # Total number of worker processes across all nodes.
    num_processes = num_nodes * gpus_per_node

    config = {
        "compute_environment": "LOCAL_MACHINE",
        "distributed_type": "DEEPSPEED",
        "deepspeed_config": {
            "deepspeed_config_file": "configs/ds_config.json",
            "zero3_init_flag": True,
            "deepspeed_multinode_launcher": "standard",
            "deepspeed_hostfile": HOSTFILE_PATH,
        },
        "machine_rank": node_rank,
        "main_process_ip": master_addr,
        "main_process_port": master_port,
        "main_training_function": "main",
        "num_machines": num_nodes,
        "num_processes": num_processes,
        "same_network": True,
        "use_cpu": False,
        "rdzv_backend": "c10d",
        "tpu_env": [],
        "tpu_use_cluster": False,
        "tpu_use_sudo": False,
    }

    print("Generated Configuration:")
    print(f" Master: {master_addr}:{master_port}")
    print(f" Number of nodes: {num_nodes}")
    print(f" GPUs per node: {gpus_per_node}")
    print(f" Total processes: {num_processes}")
    print(f" Node rank: {node_rank}")

    # Create the directory the YAML file is actually written to. Creating a
    # CWD-relative "configs" directory only worked when the script happened
    # to be started from TRAINING_DIR.
    os.makedirs(os.path.dirname(ACCELERATE_CONFIG_PATH), exist_ok=True)

    with open(ACCELERATE_CONFIG_PATH, "w", encoding="utf-8") as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"\nConfiguration saved to: {ACCELERATE_CONFIG_PATH}")

    # Echo the file back so the result can be inspected in the job log.
    print("\nFile content:")
    with open(ACCELERATE_CONFIG_PATH, "r", encoding="utf-8") as f:
        print(f.read())

    # Build the hostfile from the HostName entries of the SSH config.
    with open(SSH_CONFIG_PATH, "r", encoding="utf-8") as f:
        hostnames = _parse_ssh_hostnames(f)

    if not hostnames:
        print(
            f"Warning: no HostName entries found in {SSH_CONFIG_PATH}; "
            "the hostfile will be empty",
            file=sys.stderr,
        )

    # One "<hostname> slots=<N>" line per host.
    with open(HOSTFILE_PATH, "w", encoding="utf-8") as f:
        f.writelines(f"{host} slots={SLOTS_PER_HOST}\n" for host in hostnames)

    print(f"提取了 {len(hostnames)} 个 hostname,已写入 {HOSTFILE_PATH}")

    # NOTE: an earlier, disabled variant derived a two-line hostfile from
    # MASTER_ADDR (the master address plus a copy with its third
    # '-'-separated field replaced by "worker"); it was superseded by the
    # SSH-config parsing above and has been removed.
    return 0


if __name__ == "__main__":
    sys.exit(main())