Source code for buildtest.scheduler.slurm

import logging
import re
import time

from buildtest.scheduler.job import Job
from buildtest.utils.command import BuildTestCommand

logger = logging.getLogger(__name__)


class SlurmJob(Job):
    """The SlurmJob class models a Slurm Job ID with helper methods to perform operations
    against an active slurm job. The SlurmJob class can poll the job to get its updated job
    state, gather job data upon completion of a test, and cancel the job if necessary. We can
    also retrieve the job state and determine if the job is running, pending, suspended, or
    cancelled. Jobs are polled via the `sacct <https://slurm.schedmd.com/sacct.html>`_ command,
    which can retrieve pending, running, and completed jobs.
    """

    def __init__(self, jobID, slurm_cmds, cluster=None):
        super().__init__(jobID)
        self.cluster = cluster
        self.slurm_cmds = slurm_cmds
    def is_pending(self):
        """If job is pending return ``True`` otherwise return ``False``. Slurm Job state for pending is ``PENDING``."""
        return self._state == "PENDING"

    def is_running(self):
        """If job is running return ``True`` otherwise return ``False``. Slurm will report ``RUNNING`` for job state."""
        return self._state == "RUNNING"

    def is_suspended(self):
        """If job is suspended return ``True`` otherwise return ``False``. Slurm will report ``SUSPENDED`` for job state."""
        return self._state == "SUSPENDED"

    def is_cancelled(self):
        """If job is cancelled return ``True`` otherwise return ``False``. Slurm will report ``CANCELLED`` for job state."""
        return self._state == "CANCELLED"
    def is_complete(self):
        """If job is complete return ``True`` otherwise return ``False``. A job is considered complete if Slurm reports
        any of the terminal states ``COMPLETED``, ``FAILED``, ``TIMEOUT``, or ``OUT_OF_MEMORY``."""
        return self._state in ["COMPLETED", "FAILED", "TIMEOUT", "OUT_OF_MEMORY"]
    def is_failed(self):
        """If job failed return ``True`` otherwise return ``False``. Slurm will report ``FAILED`` for job state."""
        return self._state == "FAILED"

    def is_out_of_memory(self):
        """If job is out of memory return ``True`` otherwise return ``False``. Slurm will report ``OUT_OF_MEMORY`` for job state."""
        return self._state == "OUT_OF_MEMORY"

    def is_timeout(self):
        """If job timed out return ``True`` otherwise return ``False``. Slurm will report ``TIMEOUT`` for job state."""
        return self._state == "TIMEOUT"
    def complete(self):
        """This method is used when gathering the job result; we assume the job is complete if it is in any of the
        following states: ``COMPLETED``, ``FAILED``, ``OUT_OF_MEMORY``, ``TIMEOUT``.
        """
        return any(
            [
                self.is_complete(),
                self.is_failed(),
                self.is_out_of_memory(),
                self.is_timeout(),
            ]
        )
    def state(self):
        """Return job state"""
        return self._state

    def workdir(self):
        """Return job work directory"""
        return self._workdir
    def cancel(self):
        """Cancel job by running ``scancel <jobid>``. If the job is submitted to a specific slurm cluster, we cancel
        the job using ``scancel <jobid> --clusters=<cluster>``. This method is called if the job exceeds
        ``maxpendtime``.
        """
        query = f"{self.slurm_cmds['scancel']} {self.jobid}"
        if self.cluster:
            query = (
                f"{self.slurm_cmds['scancel']} {self.jobid} --clusters={self.cluster}"
            )

        cmd = BuildTestCommand(query)
        cmd.execute()
        logger.debug(f"Cancelling Job: {self.jobid} by running: {query}")

        self.poll()
        self._state = "CANCELLED"
    def poll(self):
        """This method will poll the job via the ``sacct`` command to get the updated job state by running the
        following command: ``sacct -j <jobid> -o State -n -X -P``.

        Slurm will report the job state, which can then be parsed. Shown below is an example job in ``PENDING`` state.

        .. code-block:: console

            $ sacct -j 46641229 -o State -n -X -P
            PENDING
        """
        query = f"{self.slurm_cmds['sacct']} -j {self.jobid} -o State -n -X -P"
        if self.cluster:
            query += f" --clusters={self.cluster}"

        # There is a delay between job submission and when slurm can report the job via 'sacct'. This is
        # relevant when using a 1 sec pollinterval: the sacct query may not return a job state yet, so we
        # sleep briefly and retry until we get a value.
        while True:
            cmd = BuildTestCommand(query)
            cmd.execute()
            logger.debug(f"Querying JobID: '{self.jobid}' by running: '{query}'")
            output = cmd.get_output()
            self._state = "".join(output).rstrip()
            if self._state:
                logger.debug(f"JobID: '{self.jobid}' Job State: {self._state}")
                break
            logger.debug(
                f"Unable to get job state for JobID: '{self.jobid}' so trying again"
            )
            time.sleep(0.1)

        if self.is_running() and not self.starttime:
            self.starttime = time.time()
    def get_output_and_error_files(self):
        """This method will extract the file paths to StdOut and StdErr from the output of
        ``scontrol show job <jobid>``, which are used to set the output and error file.

        .. code-block:: console

            siddiq90@login07> scontrol show job 23608796
            JobId=23608796 JobName=perlmutter-gpu.slurm
               UserId=siddiq90(92503) GroupId=siddiq90(92503) MCS_label=N/A
               Priority=69119 Nice=0 Account=nstaff_g QOS=gpu_debug
               JobState=PENDING Reason=Priority Dependency=(null)
               Requeue=0 Restarts=0 BatchFlag=1 Reboot=0 ExitCode=0:0
               RunTime=00:00:00 TimeLimit=00:05:00 TimeMin=N/A
               SubmitTime=2024-03-28T12:36:05 EligibleTime=2024-03-28T12:36:05
               AccrueTime=2024-03-28T12:36:05
               StartTime=2024-03-28T12:36:14 EndTime=2024-03-28T12:41:14 Deadline=N/A
               SuspendTime=None SecsPreSuspend=0 LastSchedEval=2024-03-28T12:36:12 Scheduler=Backfill:*
               Partition=gpu_ss11 AllocNode:Sid=login07:1529462
               ReqNodeList=(null) ExcNodeList=(null)
               NodeList=
               NumNodes=1-1 NumCPUs=4 NumTasks=4 CPUs/Task=1 ReqB:S:C:T=0:0:*:*
               ReqTRES=cpu=4,mem=229992M,node=1,billing=4,gres/gpu=1
               AllocTRES=(null)
               Socks/Node=* NtasksPerN:B:S:C=4:0:*:* CoreSpec=*
               MinCPUsNode=4 MinMemoryNode=0 MinTmpDiskNode=0
               Features=gpu&a100 DelayBoot=00:00:00
               OverSubscribe=NO Contiguous=0 Licenses=u1:1 Network=(null)
               Command=/global/u1/s/siddiq90/jobs/perlmutter-gpu.slurm
               WorkDir=/global/u1/s/siddiq90/jobs
               StdErr=/global/u1/s/siddiq90/jobs/slurm-23608796.out
               StdIn=/dev/null
               StdOut=/global/u1/s/siddiq90/jobs/slurm-23608796.out
               Power=
               TresPerJob=gres:gpu:1
        """
        query = f"{self.slurm_cmds['scontrol']} show job {self.jobid}"
        if self.cluster:
            query += f" --clusters={self.cluster}"

        cmd = BuildTestCommand(query)
        cmd.execute()
        logger.debug(f"Querying JobID: '{self.jobid}' by running: '{query}'")
        content = " ".join(cmd.get_output())
        logger.debug(f"Output of scontrol show job {self.jobid}:\n{content}")

        pattern = r"StdOut=(?P<stdout>.+)"
        match = re.search(pattern, content)
        logger.debug(
            f"Extracting StdOut file by applying regular expression: {pattern}"
        )
        if match:
            self._outfile = match.group("stdout")
        else:
            logger.error(f"Unable to extract StdOut file from output: {content}")

        pattern = r"StdErr=(?P<stderr>.+)"
        match = re.search(pattern, content)
        logger.debug(
            f"Extracting StdErr file by applying regular expression: {pattern}"
        )
        if match:
            self._errfile = match.group("stderr")
        else:
            logger.error(f"Unable to extract StdErr file from output: {content}")

        logger.debug(f"Output File: {self._outfile}")
        logger.debug(f"Error File: {self._errfile}")
    def retrieve_jobdata(self):
        """This method will get the job record and is called after job completion. We use ``sacct`` to gather the
        job record and return it as a dictionary. The command we run is
        ``sacct -j <jobid> -X -n -P -o <field1>,<field2>,...,<fieldN>``.

        We retrieve the following format fields from the job record:

        - "Account"
        - "AllocNodes"
        - "AllocTRES"
        - "ConsumedEnergyRaw"
        - "CPUTimeRaw"
        - "Elapsed"
        - "ElapsedRaw"
        - "End"
        - "ExitCode"
        - "JobID"
        - "JobName"
        - "NCPUS"
        - "NNodes"
        - "QOS"
        - "ReqMem"
        - "ReqNodes"
        - "Start"
        - "State"
        - "Submit"
        - "UID"
        - "User"
        - "WorkDir"

        The output of sacct is parsed on the pipe symbol (**|**) and stored in a dict.

        .. code-block:: console

            $ sacct -j 42909266 -X -n -P -o Account,AllocNodes,AllocTRES,ConsumedEnergyRaw,CPUTimeRaw,Elapsed,End,ExitCode,JobID,JobName,NCPUS,NNodes,QOS,ReqMem,ReqNodes,Start,State,Submit,UID,User,WorkDir --clusters=cori
            nstaff|1|billing=272,cpu=272,energy=262,mem=87G,node=1|262|2176|00:00:08|2021-05-27T18:47:49|0:0|42909266|slurm_metadata|272|1|debug_knl|87Gn|1|2021-05-27T18:47:41|COMPLETED|2021-05-27T18:44:07|92503|siddiq90|/global/u1/s/siddiq90/.buildtest/tests/cori.slurm.knl_debug/metadata/slurm_metadata/0/stage

        We retrieve ExitCode and WorkDir via the sacct command to get the return code. Slurm will write the output
        and error file in the WorkDir location. We run the command below and parse its output. The ExitCode is in
        the form ``<exitcode>:<signal>``, which is a colon-separated list. For more details on Slurm exit codes see
        https://slurm.schedmd.com/job_exit_code.html

        .. code-block:: console

            $ sacct -j 46294283 --clusters=cori -X -n -P -o ExitCode,Workdir
            0:0|/global/u1/s/siddiq90/github/buildtest/var/tests/cori.slurm.knl_debug/hostname/hostname_knl/cd39a853/stage
        """
        sacct_fields = [
            "Account",
            "AllocNodes",
            "AllocTRES",
            "ConsumedEnergyRaw",
            "CPUTimeRaw",
            "Elapsed",
            "ElapsedRaw",
            "End",
            "ExitCode",
            "JobID",
            "JobName",
            "NCPUS",
            "NNodes",
            "QOS",
            "ReqMem",
            "ReqNodes",
            "Start",
            "State",
            "Submit",
            "UID",
            "User",
            "WorkDir",
        ]

        query = (
            f"{self.slurm_cmds['sacct']} -j {self.jobid} -X -n -P -o ExitCode,Workdir"
        )
        if self.cluster:
            query += f" --clusters={self.cluster}"

        cmd = BuildTestCommand(query)
        cmd.execute()
        logger.debug(
            f"Querying JobID: '{self.jobid}' ExitCode and Workdir by running: '{query}'"
        )

        out = "".join(cmd.get_output()).rstrip()

        exitcode, workdir = out.split("|")
        # The ExitCode field is in format <ExitCode>:<Signal>; for now we only care about the first number
        self._exitcode = int(exitcode.split(":")[0])
        self._workdir = workdir

        logger.debug(f"JobID: '{self.jobid}' finished with exitcode: {self._exitcode}")

        query = f"{self.slurm_cmds['sacct']} -j {self.jobid} -X -n -P -o {','.join(sacct_fields)}"
        # to query jobs from another cluster we must add --clusters=<cluster> to sacct
        if self.cluster:
            query += f" --clusters={self.cluster}"

        logger.debug(f"Gather slurm job data by running: {query}")
        cmd = BuildTestCommand(query)
        cmd.execute()
        out = "".join(cmd.get_output())

        # split the record on the | delimiter since sacct output is pipe-separated (-P)
        out = out.split("|")

        job_data = {}
        for field, value in zip(sacct_fields, out):
            job_data[field] = value

        self._jobdata = job_data
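
The snippet below is a minimal usage sketch, not part of buildtest itself: it shows how a SlurmJob might be polled until it reaches a terminal state before its output files and job record are gathered. The job id "12345" and the bare sacct/scancel/scontrol command names are assumptions for illustration; buildtest normally supplies these values from its configuration.

if __name__ == "__main__":
    # Hypothetical example values: jobid and plain slurm command names are assumptions.
    slurm_cmds = {"sacct": "sacct", "scancel": "scancel", "scontrol": "scontrol"}
    job = SlurmJob("12345", slurm_cmds)

    # Poll until the job reaches a terminal state (COMPLETED, FAILED, TIMEOUT, OUT_OF_MEMORY)
    # or is cancelled; sleep between polls to avoid hammering sacct.
    while True:
        job.poll()
        if job.complete() or job.is_cancelled():
            break
        time.sleep(5)

    # Gather output/error file paths and the full job record after completion.
    job.get_output_and_error_files()
    job.retrieve_jobdata()
    print(f"Job {job.jobid} finished in state {job.state()} with workdir {job.workdir()}")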