Commit e2b6bc0e authored by Renato Figueiro Maia
Browse files

Melhorias nos logs gerados pelo daemon do SGA.

[SOMA-6476]
parent 1018da8d
Pipeline #71445 failed with stages
in 1 minute and 16 seconds
......@@ -159,6 +159,7 @@ local function startapp(self)
self.logger:debug(msg)
local res, err = fn()
if not res or math.floor(res/100) == 4 then
self.logger:error("Failed " .. msg .. ": " .. tostring(err))
res, err = self.client:register(sga_type, self.driver:get_nodes(), self.server)
if not res then
self.logger:error("Failed registering: " .. err)
......
......@@ -90,8 +90,9 @@ function joblist:terminate()
end
end
function joblist.new(config)
function joblist.new(config, logger)
local self = {
logger = logger,
filename = config.runtime_data_dir.."/sga_jobs_persisted_data."..config.driver,
terminated = false,
gc_coro = false,
......@@ -119,6 +120,7 @@ function joblist.new(config)
local now = os.time()
for jid, job in pairs(jobs) do
if job.done and now > job.done + 60 then
self.logger:debug("Job " .. jid .. " discarded 1 minute after it completed.")
jobs[jid] = nil
self:save()
end
......
......@@ -42,11 +42,13 @@ local Server = safer.readonly {
if not job.done then
table.insert(persisted_jobs.lost, { cmd_id = job.cmd_id } )
cleanup_job(self, job)
self.logger:error("Job " .. job.jid .. " lost due to completion check failure: " .. tostring(err))
end
elseif not is_done then
table.insert(persisted_jobs.retrieved, { cmd_id = job.cmd_id, actions = create_action_uris(self, job.jid) } )
else
cleanup_job(self, job)
self.logger:info("Job " .. job.jid .. " completed")
end
end
end
......@@ -63,7 +65,7 @@ function server.new(config, logger, client, driver)
local self = {
rs = restserver:new():host(config.sgad_bind_addr or "0.0.0.0"):port(config.sgad_port),
joblist = joblist.new(config),
joblist = joblist.new(config, logger),
base_uri = "http://"..config.sgad_host..":"..config.sgad_port,
config = config,
logger = logger,
......@@ -90,14 +92,15 @@ function server.new(config, logger, client, driver)
copas.sleep(self.config.register_retry_s)
end
cleanup_job(self, job)
self.logger:info("Command "..job.jid.." done")
self.logger:info("Job "..job.jid.." completed")
else
cleanup_job(self, job)
self.logger:warn("Job "..job.jid.." completed, but not reported due to lack of execution time information")
end
break
elseif is_done == nil then
self.logger:error(walltime_s)
cleanup_job(self, job)
self.logger:error("Job "..job.jid.." lost due to completion check failure: "..tostring(walltime_s))
break
end
end
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment