服务器统一取名为 xxxN 的形式, 例如 linux0, linux1, ..., linuxN, 即相同的名称前缀后接连续整数, 用来标识对应节点.
这么做的原因是后续操作slurm时可以用 linux[N1-N2] 的写法批量操作或查询节点.

规划:
slurm可以最多选取两个节点分别作为主备控制节点.
linux[0-199] 作为计算节点. 同时linux[0-1]也分别作为主备控制节点.

步骤:

  1. 更改所有服务器的DNS或/etc/hosts文件. 建立计算机名称(linux[0-199])和其IP解析关系

同步munge.key (所有节点的 /etc/munge/munge.key 必须完全相同)
同步slurm.conf (所有节点的 /etc/slurm/slurm.conf 必须保持一致)

cp /home/hosts /etc/hosts

yum install munge
create-munge-key

cp /home/munge.key /etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
service munge start
chkconfig munge on
[root@linux2 ldapuser1]# qperf 192.168.5.7 tcp_bw tcp_lat conf
tcp_bw:
    bw  =  1.18 GB/sec
tcp_lat:
    latency  =  12.2 us
conf:
    loc_node   =  linux2
    loc_cpu    =  32 Cores: Intel Xeon E5-2667 v3 @ 3.20GHz
    loc_os     =  Linux 2.6.32-431.el6.x86_64
    loc_qperf  =  0.4.9
    rem_node   =  linux5
    rem_cpu    =  32 Cores: Intel Xeon E5-2667 v3 @ 3.20GHz
    rem_os     =  Linux 2.6.32-431.el6.x86_64
    rem_qperf  =  0.4.9
[root@linux2 ldapuser1]#

[root@linux2 ldapuser1]# qperf 192.168.5.12 tcp_bw tcp_lat conf
tcp_bw:
    bw  =  118 MB/sec
tcp_lat:
    latency  =  28 us
conf:
    loc_node   =  linux2
    loc_cpu    =  32 Cores: Intel Xeon E5-2667 v3 @ 3.20GHz
    loc_os     =  Linux 2.6.32-431.el6.x86_64
    loc_qperf  =  0.4.9
    rem_node   =  localhost.localdomain
    rem_cpu    =  32 Cores: Mixed CPUs
    rem_os     =  Linux 3.10.0-327.el7.x86_64
    rem_qperf  =  0.4.9
[root@linux2 ldapuser1]#
slurm.conf 中设置: SlurmdUser=root

rpm -i /home/slurm-*.rpm
cp /home/slurm.conf /etc/slurm/slurm.conf
service slurm start
chkconfig slurm on

service iptables stop
chkconfig iptables off

sinfo

scontrol update node=linux[0-9] state=idle

scontrol show node linux0
srun -N3 -l hostname

使用 mpich2、openmpi 时会动态使用临时端口. 由于无法提前得知端口号, 这里要关闭iptables. 后期可考虑动态允许这些端口通过防火墙.
参考: http://slurm.schedmd.com/

Tag:none

Add a new comment.