提前安装并运行munge

[root@offline-almalinux8-193-computing slurm-23.11.0]# ./configure --prefix=/tools/OSS/Slurm/23.11.0 --with-munge
[root@offline-almalinux8-193-computing slurm-23.11.0]# make && make install

配置

使用 Slurm Configuration Tool 定制生成 slurm.conf 配置文件:

# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=ic-design-cluster-001
SlurmctldHost=offline-almalinux8-193-computing.icinfra.cn
SlurmctldHost=offline-almalinux8-194-computing.icinfra.cn
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
#MpiDefault=
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
#SwitchType=
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
#JobAcctGatherType=
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=offline-almalinux8-19[5-7]-computing.icinfra.cn State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

运行Controller

前台运行,看是否有报错

[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: fatal: mkdir(/var/spool/slurmctld): Permission denied

提示无权限,这里创建该目录并将 owner 修改为 slurm 用户:

[cloud-user@offline-almalinux8-193-computing ~]$ sudo mkdir /var/spool/slurmctld && sudo chown slurm /var/spool/slurmctld

前台运行

[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: error: Configured MailProg is invalid
slurmctld: slurmctld version 23.11.0 started on cluster ic-design-cluster-001
slurmctld: error: _shutdown_bu_thread:send/recv offline-almalinux8-194-computing.icinfra.cn: Connection refused
slurmctld: No memory enforcing mechanism configured.
slurmctld: error: Could not open node state file /var/spool/slurmctld/node_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Information may be lost!
slurmctld: No node state file (/var/spool/slurmctld/node_state.old) to recover
slurmctld: error: Could not open job state file /var/spool/slurmctld/job_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Jobs may be lost!
slurmctld: No job state file (/var/spool/slurmctld/job_state.old) to recover
slurmctld: select/cons_tres: select_p_node_init: select/cons_tres SelectTypeParameters not specified, using default value: CR_Core_Memory
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: error: Could not open reservation state file /var/spool/slurmctld/resv_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Reservations may be lost
slurmctld: No reservation state file (/var/spool/slurmctld/resv_state.old) to recover
slurmctld: error: Could not open trigger state file /var/spool/slurmctld/trigger_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Triggers may be lost!
slurmctld: No trigger state file (/var/spool/slurmctld/trigger_state.old) to recover
slurmctld: Reinitializing job accounting state
slurmctld: select/cons_tres: select_p_reconfigure: select/cons_tres: reconfigure
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: Running as primary controller

后台运行

[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld

运行Database(slurmdbd,用于作业记账;本文未启用 AccountingStorage,此步略过)

运行Compute Nodes

在各计算节点(195-197)上运行 slurmd(注意:计算节点运行的是 slurmd,而非 slurmctld):

[cloud-user@offline-almalinux8-195-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmd

使用

查看任务队列

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/squeue 
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                 2     debug     wrap cloud-us  R    1:27:49      1 offline-almalinux8-195-computing.icinfra.cn

查看Slurm集群信息

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo 
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
debug*       up   infinite      1  alloc offline-almalinux8-195-computing.icinfra.cn
debug*       up   infinite      2   idle offline-almalinux8-196-computing.icinfra.cn,offline-almalinux8-197-computing.icinfra.cn
[cloud-user@offline-almalinux8-194-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo -Nl
Tue Nov 28 23:53:20 2023
NODELIST                                     NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON              
offline-almalinux8-195-computing.icinfra.cn      1    debug*   allocated 1       1:1:1      1        0      1   (null) none                
offline-almalinux8-196-computing.icinfra.cn      1    debug*   allocated 1       1:1:1      1        0      1   (null) none                
offline-almalinux8-197-computing.icinfra.cn      1    debug*        idle 1       1:1:1      1        0      1   (null) none   

提交作业

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sbatch --wrap="sleep 36000"

查看作业

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sstat 2
JobID         MaxVMSize  MaxVMSizeNode  MaxVMSizeTask  AveVMSize     MaxRSS MaxRSSNode MaxRSSTask     AveRSS MaxPages MaxPagesNode   MaxPagesTask   AvePages     MinCPU MinCPUNode MinCPUTask     AveCPU   NTasks AveCPUFreq ReqCPUFreqMin ReqCPUFreqMax ReqCPUFreqGov ConsumedEnergy  MaxDiskRead MaxDiskReadNode MaxDiskReadTask  AveDiskRead MaxDiskWrite MaxDiskWriteNode MaxDiskWriteTask AveDiskWrite TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode TRESUsageInMaxTask TRESUsageInMin TRESUsageInMinNode TRESUsageInMinTask TRESUsageInTot TRESUsageOutAve TRESUsageOutMax TRESUsageOutMaxNode TRESUsageOutMaxTask TRESUsageOutMin TRESUsageOutMinNode TRESUsageOutMinTask TRESUsageOutTot 
------------ ---------- -------------- -------------- ---------- ---------- ---------- ---------- ---------- -------- ------------ -------------- ---------- ---------- ---------- ---------- ---------- -------- ---------- ------------- ------------- ------------- -------------- ------------ --------------- --------------- ------------ ------------ ---------------- ---------------- ------------ -------------- -------------- ------------------ ------------------ -------------- ------------------ ------------------ -------------- --------------- --------------- ------------------- ------------------- --------------- ------------------- ------------------- ---------------

参考资料

https://slurm.schedmd.com/quickstart_admin.html#build_install #slurm的构建与安装

https://www.icinfra.cn/blog/2023/setting-up-munge-on-almalinux8/ #munge的安装