在RHEL6.5中部署Slurm
服务器按 xxxN 的形式命名, 例如 linux0, linux1, ... , linuxN, 即名称前缀相同, 后接连续整数来标识对应节点.
这么做的原因是后续操作slurm时可以用 linux[N1-N2] 的写法批量操作或查询节点
规划:
slurm可以最多选取两个节点分别作为主备控制节点.
linux[0-199] 作为计算节点. 同时linux[0-1]也分别作为主备控制节点.
步骤:
- 更改所有服务器的DNS或/etc/hosts文件. 建立计算机名称(linux[0-199])和其IP解析关系
- 同步munge.key
- 同步slurm.conf
cp /home/hosts /etc/hosts
yum install munge
create-munge-key
cp /home/munge.key /etc/munge/munge.key
chown munge:munge /etc/munge/munge.key
service munge start
chkconfig munge on
[root@linux2 ldapuser1]# qperf 192.168.5.7 tcp_bw tcp_lat conf
tcp_bw:
bw = 1.18 GB/sec
tcp_lat:
latency = 12.2 us
conf:
loc_node = linux2
loc_cpu = 32 Cores: Intel Xeon E5-2667 v3 @ 3.20GHz
loc_os = Linux 2.6.32-431.el6.x86_64
loc_qperf = 0.4.9
rem_node = linux5
rem_cpu = 32 Cores: Intel Xeon E5-2667 v3 @ 3.20GHz
rem_os = Linux 2.6.32-431.el6.x86_64
rem_qperf = 0.4.9
[root@linux2 ldapuser1]#
[root@linux2 ldapuser1]# qperf 192.168.5.12 tcp_bw tcp_lat conf
tcp_bw:
bw = 118 MB/sec
tcp_lat:
latency = 28 us
conf:
loc_node = linux2
loc_cpu = 32 Cores: Intel Xeon E5-2667 v3 @ 3.20GHz
loc_os = Linux 2.6.32-431.el6.x86_64
loc_qperf = 0.4.9
rem_node = localhost.localdomain
rem_cpu = 32 Cores: Mixed CPUs
rem_os = Linux 3.10.0-327.el7.x86_64
rem_qperf = 0.4.9
[root@linux2 ldapuser1]#
SlurmdUser=root
rpm -i /home/slurm-*.rpm
cp /home/slurm.conf /etc/slurm/slurm.conf
service slurm start
chkconfig slurm on
service iptables stop
chkconfig iptables off
sinfo
scontrol update node=linux[0-9] state=idle
scontrol show node linux0
srun -N3 -l hostname
使用 mpich2 或 openmpi 时会动态使用临时端口. 由于无法提前得知端口, 这里要关闭iptables. 后期可考虑让防火墙动态放行这些端口
参考: http://slurm.schedmd.com/