Manual installation of a Warewulf + Slurm environment
Installation and deployment
- CentOS 7.7, 2C2G20G x 1: SMS (master)
- CentOS 7.7, 2C2G20G x 2: compute nodes; adjust the boot order so that PXE is first
Network
- External x 1: connected to the public network
- Private x 1: internal network
Deploying the sms node
$ cp /opt/ohpc/pub/doc/recipes/centos7/input.local input.local
$ cp -p /opt/ohpc/pub/doc/recipes/centos7/x86_64/warewulf/pbspro/recipe.sh .
$ cat input.local # selected values shown below
sms_ip="${sms_ip:-172.20.0.80}"
sms_eth_internal="${sms_eth_internal:-ens33}"
internal_netmask="${internal_netmask:-255.255.255.0}"
eth_provision="${eth_provision:-ens33}"
slurm_node_config="${slurm_node_config:-c[1-1] Sockets=2 CoresPerSocket=12 ThreadsPerCore=2}"
num_computes="${num_computes:-1}"
...
$ export OHPC_INPUT_LOCAL=./input.local
$ ./recipe.sh
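Note: the Sockets/CoresPerSocket/ThreadsPerCore values in slurm_node_config must match the actual compute hardware, or slurmd will refuse to register the nodes. A quick way to read the real topology on a compute node:
$ lscpu | egrep '^(Socket|Core|Thread)'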
Testing
# Check that the compute nodes respond
pdsh -w c[1-2] hostname
# Run interactively through the PBS queue: 2 compute nodes, 4 MPI processes on each
$ su - test
$ mpicc -O3 /opt/ohpc/pub/examples/mpi/hello.c
$ ls
a.out
$ qsub -I -l select=2:mpiprocs=4
$ prun ./a.out
$
Configure hostnames: sms / c11 / c12 / c13
Configure passwordless SSH from sms to c1[1-3]
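A minimal sketch of the passwordless-SSH setup, assuming root SSH logins are permitted and the default key path:
# Generate a key on sms (skip if one exists), then push it to each compute node
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa
for n in c11 c12 c13; do ssh-copy-id root@$n; done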
yum install -y http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
yum -y install ohpc-base docs-ohpc
pdsh -w c1[1-3] yum install -y http://build.openhpc.community/OpenHPC:/1.3/CentOS_7/x86_64/ohpc-release-1.3-1.el7.x86_64.rpm
pdsh -w sms,c1[1-3] yum install -y tcping wget vim iftop htop
pdsh -w sms,c1[1-3] systemctl disable firewalld
pdsh -w sms,c1[1-3] systemctl stop firewalld
pdsh -w sms,c1[1-3] yum install -y ntp
pdsh -w sms,c1[1-3] systemctl enable ntpd.service
pdsh -w sms,c1[1-3] "echo 'server ${ntp_server}' >> /etc/ntp.conf" # quote the command so the redirect runs on each node, not locally
pdsh -w sms,c1[1-3] systemctl restart ntpd
pdsh -w sms,c1[1-3] date
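To verify that time sync is actually working on every node:
pdsh -w sms,c1[1-3] "ntpq -p" | dshbak -c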
cp /opt/ohpc/pub/doc/recipes/centos7/input.local input.local
# edit input.local; the changed values, shown as a diff:
+sms_ip="${sms_ip:-172.18.1.10}"
+sms_eth_internal="${sms_eth_internal:-eth0}"
+internal_netmask="${internal_netmask:-255.255.255.0}"
+c_name[0]=c11
+c_name[1]=c12
+c_name[2]=c13
+c_ip[0]=172.18.1.11
+c_ip[1]=172.18.1.12
+c_ip[2]=172.18.1.13
+c_mac[0]=fa:16:3e:62:20:f6
+c_mac[1]=fa:16:3e:f7:d3:46
+c_mac[2]=fa:16:3e:81:1d:b7
source input.local
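The c_mac entries must match each node's PXE NIC. Since the nodes are already reachable here, the MACs can be read directly (assuming eth0 is the provisioning NIC); otherwise take them from the hypervisor or the PXE boot screen:
pdsh -w c1[1-3] cat /sys/class/net/eth0/address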
# Install base meta-packages
[sms]# yum -y install ohpc-base
[sms]# yum -y install ohpc-warewulf
# Configure Warewulf
# Configure Warewulf provisioning to use desired internal interface
[sms]# perl -pi -e "s/device = eth1/device = ${sms_eth_internal}/" /etc/warewulf/provision.conf
# Enable tftp service for compute node image distribution
[sms]# perl -pi -e "s/^\s+disable\s+= yes/ disable = no/" /etc/xinetd.d/tftp
# Install slurm server meta-package
[sms]# yum -y install ohpc-slurm-server
# Identify resource manager hostname on master host
[sms]# perl -pi -e "s/ControlMachine=\S+/ControlMachine=${sms_name}/" /etc/slurm/slurm.conf
[sms]# perl -pi -e "s/^NodeName=.*/NodeName=${slurm_node_config} State=UNKNOWN/" /etc/slurm/slurm.conf
[sms]# perl -pi -e "s/ Nodes=c\S+ / Nodes=c[1-$num_computes] /" /etc/slurm/slurm.conf
# Enable internal interface for provisioning
[sms]# ifconfig ${sms_eth_internal} ${sms_ip} netmask ${internal_netmask} up
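Note that this ifconfig assignment does not survive a reboot. A minimal sketch of a persistent setup, assuming the stock CentOS 7 ifcfg scheme:
# Write a static ifcfg file so the address comes back on boot
[sms]# cat > /etc/sysconfig/network-scripts/ifcfg-${sms_eth_internal} <<EOF
DEVICE=${sms_eth_internal}
BOOTPROTO=static
IPADDR=${sms_ip}
NETMASK=${internal_netmask}
ONBOOT=yes
EOF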
# Restart/enable relevant services to support provisioning
[sms]# systemctl enable xinetd
[sms]# systemctl restart xinetd
[sms]# systemctl enable mariadb.service
[sms]# systemctl restart mariadb
[sms]# systemctl enable httpd.service
[sms]# systemctl restart httpd
[sms]# systemctl enable dhcpd.service
# Restart dhcp / update PXE
[sms]# systemctl restart dhcpd
[sms]# wwsh pxe update -v
# Optionally, install opensm (only needed as the subnet manager for an InfiniBand fabric)
yum -y install opensm
systemctl enable opensm
systemctl start opensm
# Override default OS repository (optional) - set YUM_MIRROR variable to desired repo location
[sms]# export YUM_MIRROR=${BOS_MIRROR}
# Define chroot location
[sms]# export CHROOT=/opt/ohpc/admin/images/centos7.7
# Build initial chroot image
[sms]# wwmkchroot centos-7 $CHROOT
# Install compute node base meta-package
[sms]# yum -y --installroot=$CHROOT install ohpc-base-compute
[sms]# cp -p /etc/resolv.conf $CHROOT/etc/resolv.conf
# Add Slurm client support meta-package
[sms]# yum -y --installroot=$CHROOT install ohpc-slurm-client
# Add Network Time Protocol (NTP) support
[sms]# yum -y --installroot=$CHROOT install ntp
# Add kernel drivers
[sms]# yum -y --installroot=$CHROOT install kernel
# Include modules user environment
[sms]# yum -y --installroot=$CHROOT install lmod-ohpc
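A quick sanity check that the meta-packages actually landed in the image, querying the chroot's RPM database:
[sms]# rpm --root=$CHROOT -q ohpc-slurm-client kernel lmod-ohpc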
- Customize system configuration
# Initialize warewulf database and ssh_keys
[sms]# wwinit database
[sms]# wwinit ssh_keys
# Add NFS client mounts of /home and /opt/ohpc/pub to base image
[sms]# echo "${sms_ip}:/home /home nfs nfsvers=3,nodev,nosuid 0 0" >> $CHROOT/etc/fstab
[sms]# echo "${sms_ip}:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=3,nodev 0 0" >> $CHROOT/etc/fstab
# Export /home and OpenHPC public packages from master server
[sms]# echo "/home *(rw,no_subtree_check,fsid=10,no_root_squash)" >> /etc/exports
[sms]# echo "/opt/ohpc/pub *(ro,no_subtree_check,fsid=11)" >> /etc/exports
[sms]# exportfs -a
[sms]# systemctl restart nfs-server
[sms]# systemctl enable nfs-server
# Enable NTP time service on computes and identify master host as local NTP server
[sms]# chroot $CHROOT systemctl enable ntpd
[sms]# echo "server ${sms_ip}" >> $CHROOT/etc/ntp.conf
- Increase locked memory limits
# Update memlock settings on master
[sms]# perl -pi -e 's/# End of file/\* soft memlock unlimited\n$&/s' /etc/security/limits.conf
[sms]# perl -pi -e 's/# End of file/\* hard memlock unlimited\n$&/s' /etc/security/limits.conf
# Update memlock settings within compute image
[sms]# perl -pi -e 's/# End of file/\* soft memlock unlimited\n$&/s' $CHROOT/etc/security/limits.conf
[sms]# perl -pi -e 's/# End of file/\* hard memlock unlimited\n$&/s' $CHROOT/etc/security/limits.conf
# Enable slurm pam module
echo "account required pam_slurm.so" >> $CHROOT/etc/pam.d/sshd
# Optionally, enable nhc and configure
yum -y install nhc-ohpc
yum -y --installroot=$CHROOT install nhc-ohpc
echo "HealthCheckProgram=/usr/sbin/nhc" >> /etc/slurm/slurm.conf
echo "HealthCheckInterval=300" >> /etc/slurm/slurm.conf # execute every five minutes
# ----------------------------
# Import files (Section 3.8.5)
# ----------------------------
wwsh file import /etc/passwd
wwsh file import /etc/group
wwsh file import /etc/shadow
wwsh file import /etc/slurm/slurm.conf
wwsh file import /etc/munge/munge.key
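To confirm the imports succeeded, list the files Warewulf now manages:
wwsh file list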
# --------------------------------------
# Assemble bootstrap image (Section 3.9)
# --------------------------------------
export WW_CONF=/etc/warewulf/bootstrap.conf
echo "drivers += updates/kernel/" >> $WW_CONF
echo "drivers += overlay" >> $WW_CONF
wwbootstrap `uname -r`
# Assemble VNFS
wwvnfs --chroot $CHROOT
# Add hosts to cluster
echo "GATEWAYDEV=${eth_provision}" > /tmp/network.$$
wwsh -y file import /tmp/network.$$ --name network
wwsh -y file set network --path /etc/sysconfig/network --mode=0644 --uid=0
for ((i=0; i<$num_computes; i++)) ; do
wwsh -y node new ${c_name[i]} --ipaddr=${c_ip[i]} --hwaddr=${c_mac[i]} -D ${eth_provision}
done
# Add hosts to cluster (Cont.)
wwsh -y provision set "${compute_regex}" --vnfs=centos7.7 --bootstrap=`uname -r` --files=dynamic_hosts,passwd,group,shadow,slurm.conf,munge.key,network
systemctl restart dhcpd
wwsh pxe update
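To double-check the registered nodes and their provisioning settings before the first PXE boot:
wwsh node list
wwsh provision list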
# Optionally, enable user namespaces
export kargs="${kargs} namespace.unpriv_enable=1"
echo "user.max_user_namespaces=15076" >> $CHROOT/etc/sysctl.conf
wwvnfs --chroot $CHROOT
Manual deployment of compute nodes
[sms]# echo 172.18.1.10 sms >> /etc/hosts
[sms]# scp /etc/hosts c11:/etc/hosts
[sms]# scp input.local c11:/root/
[c11]# . input.local
# Install compute node base meta-package
[c11]# yum -y install ohpc-base-compute
# Configure DNS: copy /etc/resolv.conf over from the sms node (cp -p)
# Add Slurm client support meta-package
[c11]# yum -y install ohpc-slurm-client
# Add Network Time Protocol (NTP) support
[c11]# yum -y install ntp
# Enable NTP time service on computes and identify master host as local NTP server
[c11]# systemctl enable ntpd
[c11]# echo "server ${sms_ip}" >> /etc/ntp.conf
# Add kernel drivers
[c11]# yum -y install kernel
# Include modules user environment
[c11]# yum -y install lmod-ohpc
[c11]# perl -pi -e "s/^NodeName=(\S+)/NodeName=${compute_prefix}[1-${num_computes}]/" /etc/slurm/slurm.conf
[c11]# perl -pi -e "s/^NodeName=.*/NodeName=${slurm_node_config} State=UNKNOWN/" /etc/slurm/slurm.conf
[c11]# perl -pi -e "s/^PartitionName=normal Nodes=(\S+)/PartitionName=normal Nodes=${compute_prefix}[1-${num_computes}]/" /etc/slurm/slurm.conf
[c11]# perl -pi -e "s/ControlMachine=\S+/ControlMachine=${sms_name}/" /etc/slurm/slurm.conf
- Customize system configuration
# Add NFS client mounts of /home and /opt/ohpc/pub to base image
[c11]# echo "${sms_ip}:/home /home nfs nfsvers=3,nodev,nosuid 0 0" >> /etc/fstab
[c11]# echo "${sms_ip}:/opt/ohpc/pub /opt/ohpc/pub nfs nfsvers=3,nodev 0 0" >> /etc/fstab
- Increase locked memory limits
# Update memlock settings within compute image
[c11]# perl -pi -e 's/# End of file/\* soft memlock unlimited\n$&/s' /etc/security/limits.conf
[c11]# perl -pi -e 's/# End of file/\* hard memlock unlimited\n$&/s' /etc/security/limits.conf
# Enable slurm pam module
echo "account required pam_slurm.so" >> /etc/pam.d/sshd
# Optionally, enable nhc and configure
yum -y install nhc-ohpc
echo "HealthCheckProgram=/usr/sbin/nhc" >> /etc/slurm/slurm.conf
echo "HealthCheckInterval=300" >> /etc/slurm/slurm.conf # execute every five minutes
[c11]# yum install -y munge
[c11]# systemctl enable munge
[sms]# scp /etc/munge/munge.key c11:/etc/munge/munge.key
[c11]# systemctl restart munge # restart so munge picks up the key copied from sms
wwsh file import /etc/passwd
wwsh file import /etc/group
wwsh file import /etc/shadow
wwsh file import /etc/slurm/slurm.conf
wwsh file import /etc/munge/munge.key
# ------------------------------------
# Resource Manager Startup (Section 5)
# ------------------------------------
systemctl enable munge
systemctl enable slurmctld
systemctl start munge
systemctl start slurmctld
pdsh -w c1[1-3] systemctl start slurmd
# Optionally, generate nhc config
pdsh -w c11 "/usr/sbin/nhc-genconf -H '*' -c -" | dshbak -c
useradd -m test
wwsh file resync passwd shadow group
sleep 2
pdsh -w c1[1-3] /warewulf/bin/wwgetfiles
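To confirm the resync actually reached the nodes (the new account should now resolve on every compute):
pdsh -w c1[1-3] "getent passwd test" | dshbak -c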
[root@c11 ~]# systemctl status slurmd.service
● slurmd.service - Slurm node daemon
Loaded: loaded (/usr/lib/systemd/system/slurmd.service; disabled; vendor preset: disabled)
Active: failed (Result: exit-code) since Thu 2021-07-08 18:10:03 CST; 6s ago
Process: 2258 ExecStart=/usr/sbin/slurmd $SLURMD_OPTIONS (code=exited, status=1/FAILURE)
Jul 08 18:10:03 c11 systemd[1]: Starting Slurm node daemon...
Jul 08 18:10:03 c11 slurmd[2258]: fatal: Unable to determine this slurmd's NodeName
Jul 08 18:10:03 c11 systemd[1]: slurmd.service: control process exited, code=exited status=1
Jul 08 18:10:03 c11 systemd[1]: Failed to start Slurm node daemon.
Jul 08 18:10:03 c11 systemd[1]: Unit slurmd.service entered failed state.
Jul 08 18:10:03 c11 systemd[1]: slurmd.service failed.
Debugging by starting the service manually:
[c11]# slurmd -D -N $(hostname -s)
slurmd: error: _find_node_record(753): lookup failure for c11
slurmd: fatal: ROUTE -- c11 not found in node_record_table
[sms]# sinfo -n c11
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
normal* up 1-00:00:00 0 n/a
Found the cause: /etc/slurm/slurm.conf named the nodes c[1-3], while the actual node names are c1[1-3]. Corrected entries:
NodeName=c1[1-3] Sockets=32 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN
PartitionName=normal Nodes=c1[1-3] Default=YES MaxTime=24:00:00 State=UP
[sms]# systemctl restart slurmctld.service
[c11]# systemctl start slurmd.service
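After the restart, the nodes should register and show up instead of the earlier "0 n/a":
# nodes should now appear as idle rather than "0 n/a"
[sms]# sinfo -N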
# yum install nhc-ohpc-1.4.2-5.1.noarch
...
Running transaction
Installing : nhc-ohpc-1.4.2-5.1.noarch 1/1
Error unpacking rpm package nhc-ohpc-1.4.2-5.1.noarch
error: unpacking of archive failed on file /opt/ohpc/pub/doc/contrib/nhc-ohpc-1.4.2/COPYING;60e79fca: cpio: open
Verifying : nhc-ohpc-1.4.2-5.1.noarch 1/1
Failed:
nhc-ohpc.noarch 0:1.4.2-5.1
Cause: /opt/ohpc/pub was still NFS-mounted from the sms node (exported read-only), so rpm could not unpack files under it. Unmount it first, then the package installs normally.