Post

slurm安装

slurm安装

提前安装并运行munge，然后编译安装Slurm：

[root@offline-almalinux8-193-computing slurm-23.11.0]# ./configure --prefix=/tools/OSS/Slurm/23.11.0 --with-munge
[root@offline-almalinux8-193-computing slurm-23.11.0]# make && make install

配置

使用Slurm Configuration Tool（configurator.html）定制生成slurm.conf，内容如下：

# slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=ic-design-cluster-001
SlurmctldHost=offline-almalinux8-193-computing.icinfra.cn
SlurmctldHost=offline-almalinux8-194-computing.icinfra.cn
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
#MpiDefault=
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
#SwitchType=
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
#JobAcctGatherType=
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=offline-almalinux8-19[5-7]-computing.icinfra.cn State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP

运行Controller

前台运行,看是否有报错

1
2
[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: fatal: mkdir(/var/spool/slurmctld): Permission denied

提示无权限，这里手动创建该目录，并将其owner修改为slurm：

[cloud-user@offline-almalinux8-193-computing ~]$ sudo mkdir /var/spool/slurmctld && sudo chown slurm /var/spool/slurmctld

再次前台运行：

[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: error: Configured MailProg is invalid
slurmctld: slurmctld version 23.11.0 started on cluster ic-design-cluster-001
slurmctld: error: _shutdown_bu_thread:send/recv offline-almalinux8-194-computing.icinfra.cn: Connection refused
slurmctld: No memory enforcing mechanism configured.
slurmctld: error: Could not open node state file /var/spool/slurmctld/node_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Information may be lost!
slurmctld: No node state file (/var/spool/slurmctld/node_state.old) to recover
slurmctld: error: Could not open job state file /var/spool/slurmctld/job_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Jobs may be lost!
slurmctld: No job state file (/var/spool/slurmctld/job_state.old) to recover
slurmctld: select/cons_tres: select_p_node_init: select/cons_tres SelectTypeParameters not specified, using default value: CR_Core_Memory
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: error: Could not open reservation state file /var/spool/slurmctld/resv_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Reservations may be lost
slurmctld: No reservation state file (/var/spool/slurmctld/resv_state.old) to recover
slurmctld: error: Could not open trigger state file /var/spool/slurmctld/trigger_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Triggers may be lost!
slurmctld: No trigger state file (/var/spool/slurmctld/trigger_state.old) to recover
slurmctld: Reinitializing job accounting state
slurmctld: select/cons_tres: select_p_reconfigure: select/cons_tres: reconfigure
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: Running as primary controller

后台运行

[cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld

运行Database

运行Compute Nodes

在各计算节点上运行slurmd（注意不是slurmctld）：

[cloud-user@offline-almalinux8-195-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmd

使用

查看任务队列

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/squeue 
             JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
                 2     debug     wrap cloud-us  R    1:27:49      1 offline-almalinux8-195-computing.icinfra.cn

查看Slurm集群信息

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo 
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
debug*       up   infinite      1  alloc offline-almalinux8-195-computing.icinfra.cn
debug*       up   infinite      2   idle offline-almalinux8-196-computing.icinfra.cn,offline-almalinux8-197-computing.icinfra.cn
[cloud-user@offline-almalinux8-194-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo -Nl
Tue Nov 28 23:53:20 2023
NODELIST                                     NODES PARTITION       STATE CPUS    S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON              
offline-almalinux8-195-computing.icinfra.cn      1    debug*   allocated 1       1:1:1      1        0      1   (null) none                
offline-almalinux8-196-computing.icinfra.cn      1    debug*   allocated 1       1:1:1      1        0      1   (null) none                
offline-almalinux8-197-computing.icinfra.cn      1    debug*        idle 1       1:1:1      1        0      1   (null) none   

提交作业

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sbatch --wrap="sleep 36000"

查看作业

[cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sstat 2
JobID         MaxVMSize  MaxVMSizeNode  MaxVMSizeTask  AveVMSize     MaxRSS MaxRSSNode MaxRSSTask     AveRSS MaxPages MaxPagesNode   MaxPagesTask   AvePages     MinCPU MinCPUNode MinCPUTask     AveCPU   NTasks AveCPUFreq ReqCPUFreqMin ReqCPUFreqMax ReqCPUFreqGov ConsumedEnergy  MaxDiskRead MaxDiskReadNode MaxDiskReadTask  AveDiskRead MaxDiskWrite MaxDiskWriteNode MaxDiskWriteTask AveDiskWrite TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode TRESUsageInMaxTask TRESUsageInMin TRESUsageInMinNode TRESUsageInMinTask TRESUsageInTot TRESUsageOutAve TRESUsageOutMax TRESUsageOutMaxNode TRESUsageOutMaxTask TRESUsageOutMin TRESUsageOutMinNode TRESUsageOutMinTask TRESUsageOutTot 
------------ ---------- -------------- -------------- ---------- ---------- ---------- ---------- ---------- -------- ------------ -------------- ---------- ---------- ---------- ---------- ---------- -------- ---------- ------------- ------------- ------------- -------------- ------------ --------------- --------------- ------------ ------------ ---------------- ---------------- ------------ -------------- -------------- ------------------ ------------------ -------------- ------------------ ------------------ -------------- --------------- --------------- ------------------- ------------------- --------------- ------------------- ------------------- ---------------

参考资料

https://slurm.schedmd.com/quickstart_admin.html#build_install #slurm的构建与安装

https://www.icinfra.cn/blog/2023/setting-up-munge-on-almalinux8/ #munge的安装

This post is licensed under CC BY 4.0 by the author.