提前安装并运行munge
1
2
| [root@offline-almalinux8-193-computing slurm-23.11.0]# ./configure --prefix=/tools/OSS/Slurm/23.11.0 --with-munge
[root@offline-almalinux8-193-computing slurm-23.11.0]# make && make install
|
配置
使用Slurm Configuration Tool定制slurm.conf配置文件,
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
| # slurm.conf file generated by configurator.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
ClusterName=ic-design-cluster-001
SlurmctldHost=offline-almalinux8-193-computing.icinfra.cn
SlurmctldHost=offline-almalinux8-194-computing.icinfra.cn
#
#DisableRootJobs=NO
#EnforcePartLimits=NO
#Epilog=
#EpilogSlurmctld=
#FirstJobId=1
#MaxJobId=67043328
#GresTypes=
#GroupUpdateForce=0
#GroupUpdateTime=600
#JobFileAppend=0
#JobRequeue=1
#JobSubmitPlugins=lua
#KillOnBadExit=0
#LaunchType=launch/slurm
#Licenses=foo*4,bar
#MailProg=/bin/mail
#MaxJobCount=10000
#MaxStepCount=40000
#MaxTasksPerNode=512
#MpiDefault=
#MpiParams=ports=#-#
#PluginDir=
#PlugStackConfig=
#PrivateData=jobs
ProctrackType=proctrack/cgroup
#Prolog=
#PrologFlags=
#PrologSlurmctld=
#PropagatePrioProcess=0
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#RebootProgram=
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
#SrunEpilog=
#SrunProlog=
StateSaveLocation=/var/spool/slurmctld
#SwitchType=
#TaskEpilog=
TaskPlugin=task/affinity,task/cgroup
#TaskProlog=
#TopologyPlugin=topology/tree
#TmpFS=/tmp
#TrackWCKey=no
#TreeWidth=
#UnkillableStepProgram=
#UsePAM=0
#
#
# TIMERS
#BatchStartTimeout=10
#CompleteWait=0
#EpilogMsgTime=2000
#GetEnvTimeout=2
#HealthCheckInterval=0
#HealthCheckProgram=
InactiveLimit=0
KillWait=30
#MessageTimeout=10
#ResvOverRun=0
MinJobAge=300
#OverTimeLimit=0
SlurmctldTimeout=120
SlurmdTimeout=300
#UnkillableStepTimeout=60
#VSizeFactor=0
Waittime=0
#
#
# SCHEDULING
#DefMemPerCPU=0
#MaxMemPerCPU=0
#SchedulerTimeSlice=30
SchedulerType=sched/backfill
SelectType=select/cons_tres
#
#
# JOB PRIORITY
#PriorityFlags=
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=
#PriorityCalcPeriod=
#PriorityFavorSmall=
#PriorityMaxAge=
#PriorityUsageResetPeriod=
#PriorityWeightAge=
#PriorityWeightFairshare=
#PriorityWeightJobSize=
#PriorityWeightPartition=
#PriorityWeightQOS=
#
#
# LOGGING AND ACCOUNTING
#AccountingStorageEnforce=0
#AccountingStorageHost=
#AccountingStoragePass=
#AccountingStoragePort=
#AccountingStorageType=
#AccountingStorageUser=
#AccountingStoreFlags=
#JobCompHost=
#JobCompLoc=
#JobCompParams=
#JobCompPass=
#JobCompPort=
JobCompType=jobcomp/none
#JobCompUser=
#JobContainerType=
JobAcctGatherFrequency=30
#JobAcctGatherType=
SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#SlurmSchedLogFile=
#SlurmSchedLogLevel=
#DebugFlags=
#
#
# POWER SAVE SUPPORT FOR IDLE NODES (optional)
#SuspendProgram=
#ResumeProgram=
#SuspendTimeout=
#ResumeTimeout=
#ResumeRate=
#SuspendExcNodes=
#SuspendExcParts=
#SuspendRate=
#SuspendTime=
#
#
# COMPUTE NODES
NodeName=offline-almalinux8-19[5-7]-computing.icinfra.cn State=UNKNOWN
PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP
|
运行Controller
前台运行,看是否有报错
1
2
| [cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: fatal: mkdir(/var/spool/slurmctld): Permission denied
|
提示无权限,这里创建并修改owner,
1
| [cloud-user@offline-almalinux8-193-computing ~]$ sudo mkdir /var/spool/slurmctld && sudo chown slurm /var/spool/slurmctld
|
前台运行
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
| [cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld -D
slurmctld: error: Configured MailProg is invalid
slurmctld: slurmctld version 23.11.0 started on cluster ic-design-cluster-001
slurmctld: error: _shutdown_bu_thread:send/recv offline-almalinux8-194-computing.icinfra.cn: Connection refused
slurmctld: No memory enforcing mechanism configured.
slurmctld: error: Could not open node state file /var/spool/slurmctld/node_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Information may be lost!
slurmctld: No node state file (/var/spool/slurmctld/node_state.old) to recover
slurmctld: error: Could not open job state file /var/spool/slurmctld/job_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Jobs may be lost!
slurmctld: No job state file (/var/spool/slurmctld/job_state.old) to recover
slurmctld: select/cons_tres: select_p_node_init: select/cons_tres SelectTypeParameters not specified, using default value: CR_Core_Memory
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: error: Could not open reservation state file /var/spool/slurmctld/resv_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Reservations may be lost
slurmctld: No reservation state file (/var/spool/slurmctld/resv_state.old) to recover
slurmctld: error: Could not open trigger state file /var/spool/slurmctld/trigger_state: No such file or directory
slurmctld: error: NOTE: Trying backup state save file. Triggers may be lost!
slurmctld: No trigger state file (/var/spool/slurmctld/trigger_state.old) to recover
slurmctld: Reinitializing job accounting state
slurmctld: select/cons_tres: select_p_reconfigure: select/cons_tres: reconfigure
slurmctld: select/cons_tres: part_data_create_array: select/cons_tres: preparing for 1 partitions
slurmctld: Running as primary controller
|
后台运行
1
| [cloud-user@offline-almalinux8-193-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmctld
|
运行Database
略
运行Compute Nodes
计算节点上运行的是slurmd守护进程(而非控制器的slurmctld),需在每个计算节点上启动
1
| [cloud-user@offline-almalinux8-195-computing ~]$ sudo /tools/OSS/Slurm/23.11.0/sbin/slurmd
|
使用
查看任务队列
1
2
3
| [cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/squeue
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
2 debug wrap cloud-us R 1:27:49 1 offline-almalinux8-195-computing.icinfra.cn
|
查看Slurm集群信息
1
2
3
4
5
6
7
8
9
10
| [cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo
PARTITION AVAIL TIMELIMIT NODES STATE NODELIST
debug* up infinite 1 alloc offline-almalinux8-195-computing.icinfra.cn
debug* up infinite 2 idle offline-almalinux8-196-computing.icinfra.cn,offline-almalinux8-197-computing.icinfra.cn
[cloud-user@offline-almalinux8-194-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sinfo -Nl
Tue Nov 28 23:53:20 2023
NODELIST NODES PARTITION STATE CPUS S:C:T MEMORY TMP_DISK WEIGHT AVAIL_FE REASON
offline-almalinux8-195-computing.icinfra.cn 1 debug* allocated 1 1:1:1 1 0 1 (null) none
offline-almalinux8-196-computing.icinfra.cn 1 debug* allocated 1 1:1:1 1 0 1 (null) none
offline-almalinux8-197-computing.icinfra.cn 1 debug* idle 1 1:1:1 1 0 1 (null) none
|
提交作业
1
| [cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sbatch --wrap="sleep 36000"
|
查看作业
1
2
3
| [cloud-user@offline-almalinux8-197-computing ~]$ /tools/OSS/Slurm/23.11.0/bin/sstat 2
JobID MaxVMSize MaxVMSizeNode MaxVMSizeTask AveVMSize MaxRSS MaxRSSNode MaxRSSTask AveRSS MaxPages MaxPagesNode MaxPagesTask AvePages MinCPU MinCPUNode MinCPUTask AveCPU NTasks AveCPUFreq ReqCPUFreqMin ReqCPUFreqMax ReqCPUFreqGov ConsumedEnergy MaxDiskRead MaxDiskReadNode MaxDiskReadTask AveDiskRead MaxDiskWrite MaxDiskWriteNode MaxDiskWriteTask AveDiskWrite TRESUsageInAve TRESUsageInMax TRESUsageInMaxNode TRESUsageInMaxTask TRESUsageInMin TRESUsageInMinNode TRESUsageInMinTask TRESUsageInTot TRESUsageOutAve TRESUsageOutMax TRESUsageOutMaxNode TRESUsageOutMaxTask TRESUsageOutMin TRESUsageOutMinNode TRESUsageOutMinTask TRESUsageOutTot
------------ ---------- -------------- -------------- ---------- ---------- ---------- ---------- ---------- -------- ------------ -------------- ---------- ---------- ---------- ---------- ---------- -------- ---------- ------------- ------------- ------------- -------------- ------------ --------------- --------------- ------------ ------------ ---------------- ---------------- ------------ -------------- -------------- ------------------ ------------------ -------------- ------------------ ------------------ -------------- --------------- --------------- ------------------- ------------------- --------------- ------------------- ------------------- ---------------
|
参考资料
https://slurm.schedmd.com/quickstart_admin.html#build_install #slurm的构建与安装
https://www.icinfra.cn/blog/2023/setting-up-munge-on-almalinux8/ #munge的安装