Add life support: systemd services + heartbeat timer
- symbiont-api.service: always-on API server, auto-restart on crash - symbiont-heartbeat.timer: 5-min health checks + queue processing - heartbeat.py: CLI auth check, disk check, ledger stats, queue drain Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
71041b92b1
commit
b7e026acc6
4
ledger.jsonl
Normal file
4
ledger.jsonl
Normal file
@ -0,0 +1,4 @@
|
||||
{"timestamp": "2026-03-19T19:33:35.578559", "model": "haiku", "success": true, "elapsed_seconds": 5.66, "input_tokens": 9, "output_tokens": 351, "estimated_cost_usd": 0.00975, "prompt_preview": "Classify this task:\n\nExtract email addresses from: Contact hello@example.com or support@test.org"}
|
||||
{"timestamp": "2026-03-19T19:33:39.317944", "model": "haiku", "success": true, "elapsed_seconds": 3.74, "input_tokens": 10, "output_tokens": 145, "estimated_cost_usd": 0.008121, "prompt_preview": "Extract email addresses from: Contact hello@example.com or support@test.org"}
|
||||
{"timestamp": "2026-03-19T19:33:47.049069", "model": "haiku", "success": true, "elapsed_seconds": 7.73, "input_tokens": 9, "output_tokens": 515, "estimated_cost_usd": 0.005704, "prompt_preview": "Classify this task:\n\nWrite a 3-sentence product description for an AI task router that saves money by using cheaper models"}
|
||||
{"timestamp": "2026-03-19T19:33:53.207966", "model": "sonnet", "success": true, "elapsed_seconds": 6.16, "input_tokens": 3, "output_tokens": 139, "estimated_cost_usd": 0.038423, "prompt_preview": "Write a 3-sentence product description for an AI task router that saves money by using cheaper models"}
|
||||
24
symbiont-api.service
Normal file
24
symbiont-api.service
Normal file
@ -0,0 +1,24 @@
|
||||
[Unit]
Description=Symbiont AI Orchestrator API
After=network-online.target
Wants=network-online.target
# If I crash 5 times in 60 seconds, stop trying (something is fundamentally wrong).
# Start rate limiting is configured in [Unit]; StartLimitIntervalSec= placed
# under [Service] is not a recognised key there and would be ignored.
StartLimitIntervalSec=60
StartLimitBurst=5

[Service]
Type=simple
WorkingDirectory=/data/symbiont
ExecStart=/usr/bin/python3 -m symbiont.main --serve --host 127.0.0.1 --port 8111
Restart=always
RestartSec=10
# Environment
Environment=PYTHONUNBUFFERED=1
Environment=PYTHONPATH=/data/symbiont
# Logging
StandardOutput=journal
StandardError=journal
SyslogIdentifier=symbiont-api

[Install]
WantedBy=multi-user.target
|
||||
12
symbiont-heartbeat.service
Normal file
12
symbiont-heartbeat.service
Normal file
@ -0,0 +1,12 @@
|
||||
[Unit]
|
||||
Description=Symbiont Heartbeat - queue processing and self-diagnostics
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Type=oneshot
|
||||
WorkingDirectory=/data/symbiont
|
||||
ExecStart=/usr/bin/python3 -m symbiont.heartbeat
|
||||
Environment=PYTHONPATH=/data/symbiont
|
||||
StandardOutput=journal
|
||||
StandardError=journal
|
||||
SyslogIdentifier=symbiont-heartbeat
|
||||
10
symbiont-heartbeat.timer
Normal file
10
symbiont-heartbeat.timer
Normal file
@ -0,0 +1,10 @@
|
||||
[Unit]
|
||||
Description=Symbiont Heartbeat Timer - every 5 minutes
|
||||
|
||||
[Timer]
|
||||
OnBootSec=30
|
||||
OnUnitActiveSec=5min
|
||||
AccuracySec=30s
|
||||
|
||||
[Install]
|
||||
WantedBy=timers.target
|
||||
170
symbiont/heartbeat.py
Normal file
170
symbiont/heartbeat.py
Normal file
@ -0,0 +1,170 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Heartbeat: Periodic self-check and queue processing.
|
||||
|
||||
Run by systemd timer every 5 minutes. This is Symbiont's autonomic nervous system:
|
||||
- Process any pending tasks in the queue
|
||||
- Check rate limit status and clear expired limits
|
||||
- Log a heartbeat to the ledger for uptime tracking
|
||||
- Basic self-diagnostics
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [heartbeat] %(levelname)s: %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
HEARTBEAT_LOG = Path("/data/symbiont/heartbeat.jsonl")
|
||||
LEDGER_PATH = Path("/data/symbiont/ledger.jsonl")
|
||||
|
||||
|
||||
def check_claude_cli():
    """Verify Claude Code CLI is authenticated and responsive.

    Runs ``claude auth status`` with a 10 second timeout and inspects its
    standard output for a ``loggedIn`` flag.

    Returns:
        dict: ``{"status": "ok", "detail": "authenticated"}`` when the CLI
        reports a logged-in session, ``{"status": "error", "detail": ...}``
        otherwise (not authenticated, CLI missing, timeout, ...).
    """
    try:
        result = subprocess.run(
            ["claude", "auth", "status"],
            capture_output=True, text=True, timeout=10,
        )
    except Exception as e:
        # Best-effort health probe: a missing binary or a timeout is reported
        # as a failed check rather than crashing the heartbeat.
        return {"status": "error", "detail": str(e)}

    output = result.stdout.strip()
    try:
        # Presumably the CLI prints a JSON object -- parsing it keeps the
        # check independent of whitespace/indentation in the output.
        logged_in = json.loads(output).get("loggedIn") is True
    except (ValueError, AttributeError):
        # Not JSON (or not a JSON object): fall back to the literal marker.
        logged_in = '"loggedIn": true' in output

    if logged_in:
        return {"status": "ok", "detail": "authenticated"}
    return {"status": "error", "detail": "not authenticated"}
|
||||
|
||||
|
||||
def check_disk():
    """Check available disk space on the filesystem holding ``/data``.

    Runs ``df -h /data`` with a 5 second timeout and reads the first data row.

    Returns:
        dict: on success ``{"status": "ok", "total", "used", "available",
        "use_pct"}`` with the human-readable strings printed by ``df``;
        otherwise ``{"status": "error", "detail": ...}``. A dict is always
        returned, including when ``df`` prints no usable data row.
    """
    try:
        result = subprocess.run(
            ["df", "-h", "/data"],
            capture_output=True, text=True, timeout=5,
        )
    except Exception as e:
        return {"status": "error", "detail": str(e)}

    lines = result.stdout.strip().split("\n")
    # Row 0 is the header; row 1 is the filesystem entry for /data.
    parts = lines[1].split() if len(lines) >= 2 else []
    if len(parts) < 5:
        # Missing path, df failure, or an unexpected layout: report it
        # explicitly instead of falling through and returning None.
        detail = result.stderr.strip() or "unexpected df output"
        return {"status": "error", "detail": detail}

    return {
        "status": "ok",
        "total": parts[1],
        "used": parts[2],
        "available": parts[3],
        "use_pct": parts[4],
    }
|
||||
|
||||
|
||||
def check_api_server():
    """Report whether the ``symbiont-api`` systemd unit is active.

    Asks ``systemctl is-active symbiont-api`` (5 second timeout) and maps the
    printed state to a status.

    Returns:
        dict: ``{"status": "ok", "detail": "active"}`` when the unit is
        active, ``{"status": "down", "detail": <state>}`` for any other
        printed state, or ``{"status": "error", "detail": <message>}`` when
        the command itself could not be run.
    """
    command = ["systemctl", "is-active", "symbiont-api"]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=5)
    except Exception as exc:
        return {"status": "error", "detail": str(exc)}

    state = proc.stdout.strip()
    if state == "active":
        return {"status": "ok", "detail": state}
    return {"status": "down", "detail": state}
|
||||
|
||||
|
||||
def get_ledger_stats():
    """Quick summary of today's ledger activity.

    Streams ``LEDGER_PATH`` (one JSON object per line) and tallies the entries
    whose ``timestamp`` starts with today's local date.

    Returns:
        dict: ``{"calls_today": int, "cost_today": number}`` where
        ``cost_today`` is the sum of ``estimated_cost_usd`` rounded to four
        decimals. Both are 0 when the ledger file does not exist.
    """
    # Ledger timestamps are compared as naive ISO strings, so use the same
    # naive local clock here.
    today = datetime.now().strftime("%Y-%m-%d")
    calls = 0
    cost = 0.0

    try:
        # Iterate the file lazily: the ledger only ever grows, so avoid
        # loading it whole on every heartbeat.
        with open(LEDGER_PATH, encoding="utf-8") as ledger:
            for line in ledger:
                line = line.strip()
                if not line:
                    continue
                try:
                    entry = json.loads(line)
                except json.JSONDecodeError:
                    continue  # skip corrupt / partially written lines
                if not isinstance(entry, dict):
                    continue  # valid JSON but not a ledger record
                if not str(entry.get("timestamp", "")).startswith(today):
                    continue
                calls += 1
                amount = entry.get("estimated_cost_usd", 0)
                # A null or non-numeric cost still counts as a call but
                # contributes nothing to the total.
                if isinstance(amount, (int, float)):
                    cost += amount
    except FileNotFoundError:
        return {"calls_today": 0, "cost_today": 0}

    return {"calls_today": calls, "cost_today": round(cost, 4)}
|
||||
|
||||
|
||||
def process_queue():
    """Process pending tasks if any exist.

    Imports the scheduler and router lazily, then routes each pending task in
    order. A task whose result reports success is marked done with its
    output; a rate-limited result stops the loop so the remaining tasks are
    retried on the next heartbeat; any other failure is marked done with an
    ``ERROR: ...`` message.

    Returns:
        dict: ``{"processed": 0}`` when the queue is empty,
        ``{"processed": n, "remaining": m}`` after a normal run, or
        ``{"error": message, "processed": n}`` if anything raised (``n`` is
        the number of tasks completed before the failure).
    """
    processed = 0
    try:
        # Make the package importable when run outside the systemd unit,
        # without stacking a duplicate entry on every call.
        if "/data/symbiont" not in sys.path:
            sys.path.insert(0, "/data/symbiont")
        from symbiont.scheduler import get_pending_tasks, mark_task_done
        from symbiont.router import route_task

        tasks = get_pending_tasks()
        if not tasks:
            return {"processed": 0}

        for task_entry in tasks:
            task_id = task_entry["id"]
            result = route_task(task_entry["task"])
            # NOTE(review): the router's result schema is not visible here;
            # optional keys are read with .get() so a result that omits
            # "rate_limited" or "error" does not abort the whole drain.
            if result.get("success"):
                mark_task_done(task_id, result["output"])
                processed += 1
            elif result.get("rate_limited"):
                logger.info("Rate limited, will retry next heartbeat")
                break
            else:
                mark_task_done(
                    task_id, f"ERROR: {result.get('error', 'unknown error')}"
                )
                processed += 1

        return {"processed": processed, "remaining": len(tasks) - processed}
    except Exception as e:
        # Best-effort: the heartbeat must still be written, but keep the
        # traceback in the journal and report how far we got.
        logger.exception("Queue processing failed")
        return {"error": str(e), "processed": processed}
|
||||
|
||||
|
||||
def run_heartbeat():
    """Run all checks and log the heartbeat.

    Collects the CLI, disk, API-server, ledger and queue results, derives an
    overall ``health`` value from the CLI and API-server statuses, appends
    the record as one JSON line to ``HEARTBEAT_LOG`` and logs a summary.

    Returns:
        dict: the heartbeat record that was written.
    """
    logger.info("Heartbeat starting")

    # Evaluation order is significant: the timestamp is taken first and the
    # ledger snapshot is read before the queue is drained.
    stamp = datetime.now().isoformat()
    cli = check_claude_cli()
    disk = check_disk()
    api = check_api_server()
    ledger = get_ledger_stats()
    queue = process_queue()

    heartbeat = {
        "timestamp": stamp,
        "claude_cli": cli,
        "disk": disk,
        "api_server": api,
        "ledger": ledger,
        "queue": queue,
    }

    # Determine overall health from the two service-level checks only.
    statuses = (cli["status"], api["status"])
    if any(s != "ok" for s in statuses):
        health = "degraded" if "error" in statuses else "unhealthy"
    else:
        health = "healthy"
    heartbeat["health"] = health

    # Append the record to the heartbeat log (JSON Lines).
    HEARTBEAT_LOG.parent.mkdir(parents=True, exist_ok=True)
    record = json.dumps(heartbeat) + "\n"
    with open(HEARTBEAT_LOG, "a") as sink:
        sink.write(record)

    summary = (
        f"Health: {health} | "
        f"CLI: {cli['status']} | "
        f"API: {api['status']} | "
        f"Queue processed: {queue.get('processed', 0)} | "
        f"Today's calls: {ledger['calls_today']} "
        f"(${ledger['cost_today']})"
    )
    logger.info(summary)

    return heartbeat
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_heartbeat()
|
||||
29
test_router.py
Normal file
29
test_router.py
Normal file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python3
|
||||
"""End-to-end test of the Symbiont router."""
|
||||
|
||||
import sys
|
||||
import json
|
||||
|
||||
sys.path.insert(0, "/data/symbiont")
|
||||
from symbiont.router import route_task
|
||||
|
||||
def main():
    """Run the end-to-end router checks and print each raw result.

    This talks to the real router, so it only runs when the file is executed
    as a script; importing the module (e.g. during test collection) no
    longer triggers live calls.
    """
    banner = "=" * 60
    print(banner)
    print("SYMBIONT ROUTER - END TO END TEST")
    print(banner)

    cases = [
        # Test 1: Simple task (should route to Haiku)
        (
            "--- Test 1: Simple extraction task ---",
            "Extract email addresses from: Contact hello@example.com or support@test.org",
        ),
        # Test 2: Medium task (should route to Sonnet)
        (
            "--- Test 2: Content writing task ---",
            "Write a 3-sentence product description for an AI task router that saves money by using cheaper models",
        ),
    ]
    for heading, task in cases:
        print()
        print(heading)
        result = route_task(task)
        print(json.dumps(result, indent=2))

    print()
    print(banner)
    print("TESTS COMPLETE")
    print(banner)


if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in New Issue
Block a user