slurm安装
提前安装并运行 munge
[root@offline-almalinux8-193-computing slurm-23.11.0]# ./configure --prefix=/tools/OSS/Slurm/23.11.0 --with-munge
[root@offline-almalinux8-193-computing slurm-23.11.0]# make && make install
配置
# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=ic-design-cluster-001
SlurmctldHost=offline-almalinux8-193-computing.icinfra.cn
SlurmctldHost=offline-almalinux8-194-computing.icinfra.cn
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
#MpiDefault=
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
#SwitchType=
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
#JobAcctGatherType=
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=offline-almalinux8-19[5-7]-computing.icinfra.cn State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
运行Controller
前台运行,看是否有报错
[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: fatal: mkdir(/var/spool/slurmctld): Permission denied
提示无权限,这里创建该目录并将 owner 修改为 slurm 用户:
[cloud-user@offline-almalinux8-193-computing ~]$ sudo mkdir /var/spool/slurmctld && sudo chown slurm /var/spool/slurmctld
前台运行
[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: error: Configured MailProg is invalid
slurmctld: slurmctld version 23.11.0 started on cluster ic-design-cluster-001
slurmctld: error: _shutdown_bu_thread:send/recv offline-almalinux8-194-computing.icinfra.cn: Connection refused
slurmctld: No memory enforcing mechanism configured.
slurmctld: error: Could not open node state file /var/spool/slurmctld/node_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Information may be lost!
slurmctld: No node state file (/var/spool/slurmctld/node_state.old) to recover
slurmctld: error: Could not open job state file /var/spool/slurmctld/job_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Jobs may be lost!
slurmctld: No job state file (/var/spool/slurmctld/job_state.old) to recover
slurmctld: select/cons_tres: select_p_node_init: select/cons_tres SelectTypeParameters not specified, using default value: CR_Core_Memory
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: error: Could not open reservation state file /var/spool/slurmctld/resv_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Reservations may be lost
slurmctld: No reservation state file (/var/spool/slurmctld/resv_state.old) to recover
slurmctld: error: Could not open trigger state file /var/spool/slurmctld/trigger_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Triggers may be lost!
slurmctld: No trigger state file (/var/spool/slurmctld/trigger_state.old) to recover
slurmctld: Reinitializing job accounting state
slurmctld: select/cons_tres: select_p_reconfigure: select/cons_tres: reconfigure
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: Running as primary controller
后台运行
[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld
运行Database
略
运行Compute Nodes
[cloud-user@offline-almalinux8-195-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmd
使用
查看任务队列
[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
2 debug wrap cloud-us R 1:27:49 1 offline-almalinux8-195-computing.icinfra.cn
查看Slurm集群信息
[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
debug* up infinite 1 alloc offline-almalinux8-195-computing.icinfra.cn
debug* up infinite 2 idle offline-almalinux8-196-computing.icinfra.cn,offline-almalinux8-197-computing.icinfra.cn
[cloud-user@offline-almalinux8-194-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo -Nl
Tue Nov 28 23:53:20 2023
NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON
offline-almalinux8-195-computing.icinfra.cn 1 debug* allocated 1 1:1:1 1 0 1 (null) none
offline-almalinux8-196-computing.icinfra.cn 1 debug* allocated 1 1:1:1 1 0 1 (null) none
offline-almalinux8-197-computing.icinfra.cn 1 debug* idle 1 1:1:1 1 0 1 (null) none
提交作业
[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sbatch --wrap="sleep 36000"
查看作业
[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sstat 2
JobID MaxVMSize MaxVMSizeNode MaxVMSizeTask AveVMSize MaxRSS MaxRSSNode MaxRSSTask AveRSS MaxPages MaxPagesNode MaxPagesTask AvePages MinCPU MinCPUNode MinCPUTask AveCPU NTasks AveCPUFreq ReqCPUFreqMin ReqCPUFreqMax ReqCPUFreqGov ConsumedEnergy MaxDiskRead MaxDiskReadNode MaxDiskReadTask AveDiskRead MaxDiskWrite MaxDiskWriteNode MaxDiskWriteTask AveDiskWrite TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode TRESUsageInMaxTask TRESUsageInMin TRESUsageInMinNode TRESUsageInMinTask TRESUsageInTot TRESUsageOutAve TRESUsageOutMax TRESUsageOutMaxNode TRESUsageOutMaxTask TRESUsageOutMin TRESUsageOutMinNode TRESUsageOutMinTask TRESUsageOutTot
------------ ---------- -------------- -------------- ---------- ---------- ---------- ---------- ---------- -------- ------------ -------------- ---------- ---------- ---------- ---------- ---------- -------- ---------- ------------- ------------- ------------- -------------- ------------ --------------- --------------- ------------ ------------ ---------------- ---------------- ------------ -------------- -------------- ------------------ ------------------ -------------- ------------------ ------------------ -------------- --------------- --------------- ------------------- ------------------- --------------- ------------------- ------------------- ---------------
参考资料
https://slurm.schedmd.com/quickstart_admin.html#build_install #slurm的构建与安装
https://www.icinfra.cn/blog/2023/setting-up-munge-on-almalinux8/ #munge的安装
Enjoy Reading This Article?
Here are some more articles you might like to read next: