From 6806a9aa1b9efed5f6fd21d4f2f46e238f72007f Mon Sep 17 00:00:00 2001 From: Renato Maia Date: Tue, 17 Nov 2020 18:17:36 -0300 Subject: [PATCH] Status de job informa processos terminados. [SOMA-6610][SOMA-6489] --- sga/driver/posix.lua | 113 +++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/sga/driver/posix.lua b/sga/driver/posix.lua index 37caf25..e9d49c3 100644 --- a/sga/driver/posix.lua +++ b/sga/driver/posix.lua @@ -17,6 +17,47 @@ local function start_file(self, jid) return self.config.runtime_data_dir.."/"..jid..".start" end +local function process_tree(pid) + return coroutine.wrap(function () + local queue = { pid } + local processed = {} + while #queue > 0 do + local curpid = table.remove(queue, 1) + processed[curpid] = true + local children = procdata.get_children(curpid) + for _, child in ipairs(children) do + if not processed[child] then + table.insert(queue, child) + end + end + coroutine.yield(curpid) + end + end) +end + +local function get_running_processes(job) + local pids = {} + for pid in process_tree(job.data.pid) do + local pinfo = procdata.get_process_info(pid) + if pinfo then + if not job.data.pinfo[pid] then -- new process found + table.insert(job.data.plist, pid) + end + job.data.pinfo[pid] = pinfo -- update process info + if pinfo.state == "Z" then -- process is a zombie + local term = wait.wait(pid, wait.WNOHANG) -- end zombie + if term == pid then -- zombie process terminated + pinfo = nil -- mark process as not running + end + end + if pinfo then + table.insert(pids, pid) -- add to list of running processes + end + end + end + return pids +end + --- -- Execute a new command. -- @param job The job object: job.data is a writable table for driver data. @@ -85,31 +126,13 @@ function posix.execute_command(self, job, cmd_string, user_token) os.exit(0) else job.data.pid = pid + job.data.plist = { pid } + job.data.pinfo = { [pid] = assert(procdata.get_process_info(pid)) } self.logger:debug("Created PID "..pid.." for jid "..job.jid) return true end end -local function is_command_alive(self, job) - local pid = job.data.pid - -- Check that it is the one we started: (detect pid rotation) - local pinfo, err = procdata.get_process_info(pid) - if pinfo then - local pidstart = util.read_file(start_file(self, job.jid)) - -- if pidstart and tonumber(pidstart) == tonumber(pinfo.starttime) then - if pinfo.state == "Z" then - job.data.pinfo = pinfo - local term = wait.wait(pid, wait.WNOHANG) - if term == pid then - return false - end - end - return true - -- end - end - return false -end - -- -- Checks if a command has finished. -- @param job The job object @@ -121,7 +144,7 @@ function posix.is_command_done(self, job) end return true, nil end - if is_command_alive(self, job) then + if #get_running_processes(job) > 0 then return false else local donetime, err = util.read_file(done_file(self, job.jid)) @@ -153,36 +176,15 @@ function posix.cleanup_job(self, job) end end -local function process_tree(pid) - return coroutine.wrap(function () - local queue = { pid } - local processed = {} - while #queue > 0 do - local curpid = table.remove(queue, 1) - processed[curpid] = true - local children = procdata.get_children(curpid) - for _, child in ipairs(children) do - if not processed[child] then - table.insert(queue, child) - end - end - coroutine.yield(curpid) - end - end) -end - posix.actions = { -- Terminates a command -- @param job The job object terminate = function(self, job) - if is_command_alive(self, job) then - local pids = {} - for pid in process_tree(job.data.pid) do - table.insert(pids, 1, pid) - end - for _, pid in ipairs(pids) do - signal.kill(pid, signal.SIGTERM) + local pids = get_running_processes(job) + if #pids > 0 then + for i = #pids, 1, -1 do + signal.kill(pids[i], signal.SIGTERM) end job.data.killed = true return true @@ -195,27 +197,22 @@ posix.actions = { -- @param job The job object -- @return A table with information for each command component (process) status = function(self, job) - if not is_command_alive(self, job) then - return false, "Job is not running" - end - local processes = {} self.logger:debug("Got a status request for command " .. job.jid .. " (pid " ..job.data.pid .. ")") - for pid in process_tree(job.data.pid) do - local pinfo = procdata.get_process_info(pid) - - if not pinfo then - break - end - + local pstates = {} + for _, pid in ipairs(get_running_processes(job)) do + pstates[pid] = "RUNNING" + end + for _, pid in ipairs(job.data.plist) do + local pinfo = job.data.pinfo[pid] -- FIXME return more information, don't return fake information processes[#processes + 1] = { pid = pid, ppid = pinfo.ppid, exec_host = "", string = pinfo.comm, - state = "RUNNING", + state = pstates[pid] or "FINISHED", processor_id = "", memory_ram_size_mb = pinfo.rss / (1024 * 1024), memory_swap_size_mb = 0, -- GitLab