File size: 3,671 Bytes
c39435c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env python3
import os
import yaml
import sys

def _env_int(name, default):
    """Return environment variable *name* parsed as an int, or *default*.

    Falls back to *default* when the variable is unset, empty, or not a
    valid integer, so a mis-set variable cannot crash config generation.
    """
    try:
        return int(os.getenv(name, default))
    except (ValueError, TypeError):
        return default


def main():
    """Generate an accelerate/DeepSpeed YAML config and a DeepSpeed hostfile.

    Reads the cluster topology from SenseCore environment variables, writes
    the accelerate launcher configuration as YAML, then extracts ``Hostname``
    entries from an SSH config file into a DeepSpeed-style hostfile.

    Returns:
        int: 0 on success (used as the process exit code).
    """
    # Rendezvous endpoint; the defaults suit a single-node run.
    master_addr = os.getenv("MASTER_ADDR", "127.0.0.1")
    # Guarded like the SENSECORE_* vars below (the original crashed on a
    # non-numeric MASTER_PORT while tolerating bad values everywhere else).
    master_port = _env_int("MASTER_PORT", 29500)

    # Cluster topology, with safe single-GPU fallbacks.
    num_nodes = _env_int("SENSECORE_PYTORCH_NNODES", 1)
    gpus_per_node = _env_int("SENSECORE_ACCELERATE_DEVICE_COUNT", 1)
    node_rank = _env_int("SENSECORE_PYTORCH_NODE_RANK", 0)

    # Total world size across all nodes.
    num_processes = num_nodes * gpus_per_node

    # accelerate launcher configuration (DeepSpeed multi-node, standard launcher).
    config = {
        "compute_environment": "LOCAL_MACHINE",
        "distributed_type": "DEEPSPEED",
        "deepspeed_config": {
            "deepspeed_config_file": "configs/ds_config.json",
            "zero3_init_flag": True,
            "deepspeed_multinode_launcher": "standard",
            "deepspeed_hostfile": '/mnt/jfzn/msj/flash-linear-attention/legacy/training/hostfile.txt',
        },
        "machine_rank": node_rank,
        "main_process_ip": master_addr,
        "main_process_port": master_port,
        "main_training_function": "main",
        "num_machines": num_nodes,
        "num_processes": num_processes,
        "same_network": True,
        "use_cpu": False,
        "rdzv_backend": "c10d",
        "tpu_env": [],
        "tpu_use_cluster": False,
        "tpu_use_sudo": False,
    }

    # Summarize what was derived from the environment.
    print("Generated Configuration:")
    print(f"  Master: {master_addr}:{master_port}")
    print(f"  Number of nodes: {num_nodes}")
    print(f"  GPUs per node: {gpus_per_node}")
    print(f"  Total processes: {num_processes}")
    print(f"  Node rank: {node_rank}")

    # Write the YAML config.  Create the directory of the path we actually
    # write to — the original created a relative "configs" dir, which only
    # matched the absolute output path when run from the training directory.
    output_file = "/mnt/jfzn/msj/flash-linear-attention/legacy/training/configs/deepspeed_sencore.yaml"
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    with open(output_file, "w") as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"\nConfiguration saved to: {output_file}")

    # Echo the file content back for inspection.
    print("\nFile content:")
    with open(output_file, "r") as f:
        print(f.read())

    # Extract Hostname entries from the SSH config into a DeepSpeed hostfile.
    input_file = '/mnt/jfzn/msj/flash-linear-attention/legacy/training/ssh_config/config'   # SSH config source
    output_file = '/mnt/jfzn/msj/flash-linear-attention/legacy/training/hostfile.txt'    # hostfile destination

    hostnames = []
    with open(input_file, "r") as f:
        for line in f:
            line = line.strip()
            if line.startswith("Hostname"):
                # Take the value after the "Hostname" keyword; skip a bare
                # "Hostname" line with no value (the original raised IndexError).
                parts = line.split(None, 1)
                if len(parts) == 2:
                    hostnames.append(parts[1])

    # One "<host> slots=N" entry per line.
    # NOTE(review): slots is hardcoded to 8; presumably it should match
    # gpus_per_node — confirm before changing.
    with open(output_file, "w") as f:
        for host in hostnames:
            f.write(host + " slots=8\n")

    print(f"提取了 {len(hostnames)} 个 hostname,已写入 {output_file}")

    return 0

# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())