131 lines
3.4 KiB
Elixir
131 lines
3.4 KiB
Elixir
defmodule Symbiont.Heartbeat do
  @moduledoc """
  Periodic health check and queue processor.

  Runs on a configurable interval (`:heartbeat_interval_ms` in the `:symbiont`
  application environment, default: 5 minutes). The first tick fires 5 seconds
  after the server starts.

  Each tick:
  1. Collects health indicators (queue size and ledger statistics)
  2. Dispatches pending tasks from the queue (at most `:max_queue_batch`, default: 5)
  3. Appends a health snapshot to `heartbeat.jsonl` inside `:data_dir`
     (default: `"data"`)
  """

  use GenServer

  require Logger

  @default_interval_ms 300_000
  @first_tick_delay_ms 5_000
  @pulse_timeout_ms 60_000
  @default_max_batch 5

  @typedoc "Health snapshot as written to `heartbeat.jsonl` (string keys)."
  @type snapshot :: %{optional(String.t()) => term()}

  # -- Client API --

  @doc """
  Starts the heartbeat server, registered under the module name.

  `opts` is handed to `init/1`, which currently ignores it; all configuration is
  read from the `:symbiont` application environment.
  """
  @spec start_link(term()) :: GenServer.on_start()
  def start_link(opts) do
    GenServer.start_link(__MODULE__, opts, name: __MODULE__)
  end

  @doc "Trigger a heartbeat manually (useful for testing)."
  @spec pulse() :: snapshot()
  def pulse do
    GenServer.call(__MODULE__, :pulse, @pulse_timeout_ms)
  end

  @doc "Get the last recorded health snapshot, or `nil` if none has run yet."
  @spec last_snapshot() :: snapshot() | nil
  def last_snapshot do
    GenServer.call(__MODULE__, :last_snapshot)
  end

  # -- Server Callbacks --

  @impl true
  def init(_opts) do
    interval = Application.get_env(:symbiont, :heartbeat_interval_ms, @default_interval_ms)
    data_dir = Application.get_env(:symbiont, :data_dir, "data")
    heartbeat_path = Path.join(data_dir, "heartbeat.jsonl")

    # The data directory may not exist yet (fresh checkout / first boot). Without
    # it the write below raises File.Error and the server could never start.
    File.mkdir_p!(data_dir)

    # Appending an empty string creates the log when it is missing and leaves an
    # existing log untouched, without a separate exists?/write race.
    File.write!(heartbeat_path, "", [:append])

    # Schedule first heartbeat after a short delay (let other services start)
    Process.send_after(self(), :tick, @first_tick_delay_ms)

    state = %{
      interval: interval,
      heartbeat_path: heartbeat_path,
      last_snapshot: nil,
      started_at: DateTime.utc_now()
    }

    {:ok, state}
  end

  @impl true
  def handle_info(:tick, state) do
    snapshot = run_heartbeat(state)
    schedule_next(state.interval)
    {:noreply, %{state | last_snapshot: snapshot}}
  end

  # Defining handle_info/2 replaces the catch-all that `use GenServer` injects, so
  # without this clause any stray message would crash the server with a
  # FunctionClauseError.
  def handle_info(message, state) do
    Logger.debug("Heartbeat: ignoring unexpected message #{inspect(message)}")
    {:noreply, state}
  end

  @impl true
  def handle_call(:pulse, _from, state) do
    snapshot = run_heartbeat(state)
    {:reply, snapshot, %{state | last_snapshot: snapshot}}
  end

  def handle_call(:last_snapshot, _from, state) do
    {:reply, state.last_snapshot, state}
  end

  # -- Private --

  # Runs one heartbeat: gathers indicators, dispatches queued tasks, appends the
  # snapshot to the heartbeat log and returns it.
  defp run_heartbeat(state) do
    Logger.info("Heartbeat: running health check")

    # 1. Check health (queue size is sampled before this tick takes its batch)
    queue_size = Symbiont.Queue.size()
    ledger_stats = Symbiont.Ledger.stats()
    total_cost = ledger_stats["total_cost_estimated_usd"]

    # 2. Process pending tasks
    max_batch = Application.get_env(:symbiont, :max_queue_batch, @default_max_batch)
    tasks_processed = process_queue(max_batch)

    # 3. Build snapshot — one clock reading so timestamp and uptime agree
    now = DateTime.utc_now()

    snapshot = %{
      "timestamp" => DateTime.to_iso8601(now),
      # NOTE(review): status is a constant; nothing in this module derives it
      # from the indicators gathered above.
      "status" => "healthy",
      "queue_size" => queue_size,
      "tasks_processed" => tasks_processed,
      "total_calls" => ledger_stats["total_calls"],
      "total_cost" => total_cost,
      "uptime_seconds" => DateTime.diff(now, state.started_at, :second)
    }

    # 4. Log snapshot
    line = Jason.encode!(snapshot) <> "\n"
    File.write!(state.heartbeat_path, line, [:append])

    Logger.info(
      "Heartbeat: queue=#{queue_size} processed=#{tasks_processed} " <>
        "total_cost=$#{total_cost}"
    )

    snapshot
  end

  # Takes up to `max_batch` tasks from the queue and hands each one to the task
  # supervisor. Returns the number of tasks taken; execution is asynchronous, so
  # the tasks may still be running when this returns.
  defp process_queue(max_batch) do
    tasks = Symbiont.Queue.take(max_batch)
    Enum.each(tasks, &dispatch/1)
    length(tasks)
  end

  # Starts a supervised task for one queue entry. If the supervisor refuses the
  # child, the entry is marked failed instead of being silently dropped after it
  # has already been taken from the queue.
  defp dispatch(task) do
    case Task.Supervisor.start_child(Symbiont.TaskSupervisor, fn -> execute(task) end) do
      {:ok, _pid} ->
        :ok

      {:ok, _pid, _info} ->
        :ok

      other ->
        Logger.error(
          "Heartbeat: could not start task #{inspect(task["id"])}: #{inspect(other)}"
        )

        Symbiont.Queue.fail(task["id"], inspect(other))
    end
  end

  # Routes and executes one queue entry, then records the outcome on the queue.
  defp execute(task) do
    case Symbiont.Router.route_and_execute(task["task"]) do
      {:ok, result} ->
        Symbiont.Queue.complete(task["id"], result[:result])

      {:error, reason} ->
        Symbiont.Queue.fail(task["id"], inspect(reason))
    end
  end

  defp schedule_next(interval) do
    Process.send_after(self(), :tick, interval)
  end
end
|