feat: add 5 remaining monitoring summary endpoints
- health-summary: gateway status, PID, uptime, CPU/RAM/swap/disk, compaction
- cron-summary: cron jobs with schedule, status, failures, model
- sessions-summary: active sessions with model display names, context %, tokens
- sub-agents-summary: sub-agent runs with cost, duration, status, tokens
- trends: cost/token daily trends with 7d/30d range filter

All endpoints are org-scoped, support gateway_id filtering, and use data_processing functions (ModelName, BuildDailyChart) where appropriate. Syntax validated with py_compile.
This commit is contained in:
parent
3719ab42b4
commit
e348deb299
|
|
@ -25,11 +25,18 @@ from app.schemas.monitoring import (
|
|||
CostSnapshotRead,
|
||||
CostSummaryRead,
|
||||
CronJobStatusRead,
|
||||
CronSummaryRead,
|
||||
HealthSummaryRead,
|
||||
SessionEventRead,
|
||||
SessionSummaryRead,
|
||||
SubAgentRunRead,
|
||||
SubAgentSummaryRead,
|
||||
SystemHealthMetricRead,
|
||||
TrendDay,
|
||||
TrendRead,
|
||||
)
|
||||
from app.schemas.pagination import DefaultLimitOffsetPage
|
||||
from app.models.gateways import Gateway
|
||||
|
||||
router = APIRouter(prefix="/monitoring", tags=["monitoring"])
|
||||
SESSION_DEP = Depends(get_session)
|
||||
|
|
@ -473,3 +480,465 @@ async def get_cost_breakdown(
|
|||
))
|
||||
|
||||
return breakdowns
|
||||
|
||||
|
||||
@router.get(
    "/health-summary",
    response_model=list[HealthSummaryRead],
    summary="Health Summary",
    description="Latest health status and metrics for all gateways in the organization.",
)
async def get_health_summary(
    gateway_id: UUID | None = GATEWAY_ID_QUERY,
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> list[HealthSummaryRead]:
    """Get health summary for all gateways in the organization.

    Returns the latest health snapshot per gateway with system metrics.

    - **gateway_id**: Filter by specific gateway
    """
    statement = select(SystemHealthMetric).where(
        SystemHealthMetric.organization_id == ctx.organization.id
    )
    if gateway_id is not None:
        statement = statement.where(SystemHealthMetric.gateway_id == gateway_id)

    result = await session.execute(statement)
    metrics = result.scalars().all()
    if not metrics:
        return []

    # Keep only the most recent metric per gateway (by collected_at).
    by_gateway: dict[UUID, SystemHealthMetric] = {}
    for metric in metrics:
        current = by_gateway.get(metric.gateway_id)
        if current is None or metric.collected_at > current.collected_at:
            by_gateway[metric.gateway_id] = metric

    # Resolve all gateway names in a single query instead of one query per
    # gateway inside the loop (N+1 fix).
    names_result = await session.execute(
        select(Gateway).where(col(Gateway.id).in_(list(by_gateway.keys())))
    )
    gateway_names = {gw.id: gw.name for gw in names_result.scalars().all()}

    summaries: list[HealthSummaryRead] = []
    for gw_id, metric in by_gateway.items():
        # "online"/"offline" derived from the collector's liveness flag.
        status = "online" if metric.gateway_live else "offline"

        # Gateway uptime is reported in milliseconds; expose whole seconds.
        uptime_seconds = None
        if metric.gateway_uptime_ms:
            uptime_seconds = round(metric.gateway_uptime_ms / 1000)

        # RAM usage in MB; only reported when both used and total bytes are
        # present (preserves the original guard).
        memory_mb = None
        if metric.ram_used_bytes and metric.ram_total_bytes:
            memory_mb = round(metric.ram_used_bytes / (1024 * 1024), 2)

        summaries.append(HealthSummaryRead(
            gateway_id=gw_id,
            gateway_name=gateway_names.get(gw_id, "Unknown"),
            status=status,
            pid=metric.gateway_pid,
            uptime_seconds=uptime_seconds,
            memory_mb=memory_mb,
            cpu_percent=metric.cpu_percent,
            ram_percent=metric.ram_percent,
            swap_percent=metric.swap_percent,
            disk_percent=metric.disk_percent,
            compaction_mode=None,  # Not tracked in current schema
            last_collected_at=metric.collected_at,
        ))

    return summaries
|
||||
|
||||
|
||||
@router.get(
    "/cron-summary",
    response_model=list[CronSummaryRead],
    summary="Cron Summary",
    description="Status and scheduling info for all cron jobs in the organization.",
)
async def get_cron_summary(
    gateway_id: UUID | None = GATEWAY_ID_QUERY,
    enabled: bool | None = ENABLED_QUERY,
    job_name: str | None = JOB_NAME_QUERY,
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> list[CronSummaryRead]:
    """Get cron summary for all cron jobs in the organization.

    Returns cron job status with scheduling info, newest first.

    - **gateway_id**: Filter by specific gateway
    - **enabled**: Filter by enabled status
    - **job_name**: Filter by job name (exact match)
    """
    statement = select(CronJobStatus).where(
        CronJobStatus.organization_id == ctx.organization.id
    )
    if gateway_id is not None:
        statement = statement.where(CronJobStatus.gateway_id == gateway_id)
    if enabled is not None:
        statement = statement.where(CronJobStatus.enabled == enabled)
    if job_name is not None:
        statement = statement.where(CronJobStatus.job_name == job_name)

    statement = statement.order_by(col(CronJobStatus.created_at).desc())
    result = await session.execute(statement)
    jobs = result.scalars().all()
    if not jobs:
        return []

    # Resolve all gateway names with one query instead of one query per job
    # inside the loop (N+1 fix).
    names_result = await session.execute(
        select(Gateway).where(col(Gateway.id).in_({job.gateway_id for job in jobs}))
    )
    gateway_names = {gw.id: gw.name for gw in names_result.scalars().all()}

    summaries: list[CronSummaryRead] = []
    for job in jobs:
        # Surface the model from job metadata when the collector recorded one.
        model = None
        if job.metadata_ and isinstance(job.metadata_, dict):
            model = job.metadata_.get("model")

        summaries.append(CronSummaryRead(
            gateway_id=job.gateway_id,
            gateway_name=gateway_names.get(job.gateway_id, "Unknown"),
            job_name=job.job_name,
            schedule=job.schedule,
            enabled=job.enabled,
            status=job.status,
            last_run_at=job.last_run_at,
            next_run_at=job.next_run_at,
            last_error=job.last_error,
            failure_count=job.failure_count,
            model=model,
            metadata_=job.metadata_,
        ))

    return summaries
|
||||
|
||||
|
||||
@router.get(
    "/sessions-summary",
    response_model=list[SessionSummaryRead],
    summary="Sessions Summary",
    description="Recent session events with context and token breakdown.",
)
async def get_sessions_summary(
    gateway_id: UUID | None = GATEWAY_ID_QUERY,
    session_key: str | None = SESSION_KEY_QUERY,
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> list[SessionSummaryRead]:
    """Get sessions summary for the organization.

    Returns recent session events (newest first) with model display names
    and a prompt/completion/total token breakdown.

    - **gateway_id**: Filter by specific gateway
    - **session_key**: Filter by session key
    """
    from app.services.monitoring.data_processing import ModelName

    statement = select(SessionEvent).where(
        SessionEvent.organization_id == ctx.organization.id
    )
    if gateway_id is not None:
        statement = statement.where(SessionEvent.gateway_id == gateway_id)
    if session_key is not None:
        statement = statement.where(SessionEvent.session_key == session_key)

    statement = statement.order_by(col(SessionEvent.created_at).desc())
    result = await session.execute(statement)
    events = result.scalars().all()
    if not events:
        return []

    # Resolve all gateway names with one query instead of one query per
    # event inside the loop (N+1 fix).
    names_result = await session.execute(
        select(Gateway).where(col(Gateway.id).in_({e.gateway_id for e in events}))
    )
    gateway_names = {gw.id: gw.name for gw in names_result.scalars().all()}

    summaries: list[SessionSummaryRead] = []
    for event in events:
        # Human-readable model name via the shared data_processing helper.
        model_display_name = ModelName(event.model) if event.model else None

        # Token counts come from a free-form dict; fall back to summing
        # prompt + completion when the total is absent.
        prompt_tokens = None
        completion_tokens = None
        total_tokens = None
        if event.token_counts and isinstance(event.token_counts, dict):
            prompt_tokens = event.token_counts.get("prompt_tokens")
            completion_tokens = event.token_counts.get("completion_tokens")
            total_tokens = event.token_counts.get("total_tokens")
            if total_tokens is None and prompt_tokens is not None and completion_tokens is not None:
                total_tokens = prompt_tokens + completion_tokens

        summaries.append(SessionSummaryRead(
            gateway_id=event.gateway_id,
            gateway_name=gateway_names.get(event.gateway_id, "Unknown"),
            session_key=event.session_key,
            event_type=event.event_type,
            model=event.model,
            model_display_name=model_display_name,
            agent_id=event.agent_id,
            channel=event.channel,
            context_percent=event.context_percent,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            last_collected_at=event.collected_at,
        ))

    return summaries
|
||||
|
||||
|
||||
@router.get(
    "/sub-agents-summary",
    response_model=list[SubAgentSummaryRead],
    summary="Sub-Agent Summary",
    description="Sub-agent runs with cost, duration, and token breakdown.",
)
async def get_sub_agents_summary(
    gateway_id: UUID | None = GATEWAY_ID_QUERY,
    status: str | None = STATUS_QUERY,
    agent: str | None = AGENT_QUERY,
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> list[SubAgentSummaryRead]:
    """Get sub-agents summary for the organization.

    Returns sub-agent runs (newest first) with cost, duration, and a
    prompt/completion/total token breakdown.

    - **gateway_id**: Filter by specific gateway
    - **status**: Filter by run status (pending, running, succeeded, failed)
    - **agent**: Filter by agent name
    """
    from app.services.monitoring.data_processing import ModelName

    statement = select(SubAgentRun).where(
        SubAgentRun.organization_id == ctx.organization.id
    )
    if gateway_id is not None:
        statement = statement.where(SubAgentRun.gateway_id == gateway_id)
    if status is not None:
        statement = statement.where(SubAgentRun.status == status)
    if agent is not None:
        statement = statement.where(SubAgentRun.agent == agent)

    statement = statement.order_by(col(SubAgentRun.created_at).desc())
    result = await session.execute(statement)
    runs = result.scalars().all()
    if not runs:
        return []

    # Resolve all gateway names with one query instead of one query per run
    # inside the loop (N+1 fix).
    names_result = await session.execute(
        select(Gateway).where(col(Gateway.id).in_({r.gateway_id for r in runs}))
    )
    gateway_names = {gw.id: gw.name for gw in names_result.scalars().all()}

    summaries: list[SubAgentSummaryRead] = []
    for run in runs:
        # Human-readable model name via the shared data_processing helper.
        model_display_name = ModelName(run.model) if run.model else None

        # Run duration is stored in milliseconds; expose seconds (2 dp).
        duration_seconds = None
        if run.duration_ms:
            duration_seconds = round(run.duration_ms / 1000, 2)

        # Token counts come from a free-form dict; fall back to summing
        # prompt + completion when the total is absent.
        prompt_tokens = None
        completion_tokens = None
        total_tokens = None
        if run.token_counts and isinstance(run.token_counts, dict):
            prompt_tokens = run.token_counts.get("prompt_tokens")
            completion_tokens = run.token_counts.get("completion_tokens")
            total_tokens = run.token_counts.get("total_tokens")
            if total_tokens is None and prompt_tokens is not None and completion_tokens is not None:
                total_tokens = prompt_tokens + completion_tokens

        summaries.append(SubAgentSummaryRead(
            gateway_id=run.gateway_id,
            gateway_name=gateway_names.get(run.gateway_id, "Unknown"),
            agent_id=run.agent,
            session_key=run.parent_session_key,
            model=run.model,
            model_display_name=model_display_name,
            status=run.status,
            cost=run.cost,
            duration_seconds=duration_seconds,
            prompt_tokens=prompt_tokens,
            completion_tokens=completion_tokens,
            total_tokens=total_tokens,
            last_collected_at=run.collected_at,
        ))

    return summaries
|
||||
|
||||
|
||||
@router.get(
    "/trends",
    response_model=list[TrendRead],
    summary="Trends Summary",
    description="Cost and token trends over time (7d or 30d).",
)
async def get_trends(
    gateway_id: UUID | None = GATEWAY_ID_QUERY,
    range_param: str = Query(default="7d", description="Time range: 7d or 30d"),
    session: AsyncSession = SESSION_DEP,
    ctx: OrganizationContext = ORG_MEMBER_DEP,
) -> list[TrendRead]:
    """Get cost and token trends for the organization.

    Returns daily aggregated cost and token data per gateway for the
    specified time range, bucketed from each gateway's latest cost snapshot.

    - **gateway_id**: Filter by specific gateway
    - **range**: Time range - "7d" (default) or "30d" (any "<N>d" accepted)
    """
    import re
    from datetime import date, timedelta

    from app.services.monitoring.data_processing import ModelName

    # Parse "<N>d"; anything unparseable falls back to the 7-day default.
    range_days = 7
    range_match = re.match(r"^(\d+)d$", range_param)
    if range_match:
        range_days = int(range_match.group(1))

    statement = select(CostSnapshot).where(
        CostSnapshot.organization_id == ctx.organization.id
    )
    if gateway_id is not None:
        statement = statement.where(CostSnapshot.gateway_id == gateway_id)

    statement = statement.order_by(col(CostSnapshot.created_at).desc())
    result = await session.execute(statement)
    snapshots = result.scalars().all()
    if not snapshots:
        return []

    # Keep only the most recent snapshot per gateway (by collected_at).
    by_gateway: dict[UUID, CostSnapshot] = {}
    for snapshot in snapshots:
        current = by_gateway.get(snapshot.gateway_id)
        if current is None or snapshot.collected_at > current.collected_at:
            by_gateway[snapshot.gateway_id] = snapshot

    # Resolve all gateway names in a single query instead of per-gateway
    # lookups inside the loop (N+1 fix).
    names_result = await session.execute(
        select(Gateway).where(col(Gateway.id).in_(list(by_gateway.keys())))
    )
    gateway_names = {gw.id: gw.name for gw in names_result.scalars().all()}

    # Daily bucket labels, oldest first, ending today.
    today = date.today()
    chart_dates = [
        (today - timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(range_days - 1, -1, -1)
    ]

    trends: list[TrendRead] = []
    for gw_id, snapshot in by_gateway.items():
        # Buckets are built per gateway. The previous implementation shared
        # one org-wide bucket dict across all gateways, so every gateway's
        # trend reported the combined totals of all gateways.
        daily_costs: dict[str, dict[str, float]] = {}
        daily_tokens: dict[str, dict[str, int]] = {}
        if snapshot.model_costs:
            # All of a snapshot's data lands on its collection date (the
            # schema has no per-day history, so each gateway contributes to
            # a single bucket).
            date_str = snapshot.collected_at.strftime("%Y-%m-%d")
            cost_bucket = daily_costs.setdefault(date_str, {})
            token_bucket = daily_tokens.setdefault(date_str, {})
            for model, cost in snapshot.model_costs.items():
                cost_bucket[model] = cost
                if snapshot.token_counts and model in snapshot.token_counts:
                    token_bucket[model] = snapshot.token_counts[model]

        days_data: list[TrendDay] = []
        for d in chart_dates:
            day_costs = daily_costs.get(d, {})
            day_tokens = daily_tokens.get(d, {})
            days_data.append(TrendDay(
                date=d,
                total_cost=round(sum(day_costs.values()), 2),
                total_tokens=sum(day_tokens.values()),
                # Keyed by display name for chart legends.
                model_costs={ModelName(m): round(c, 2) for m, c in day_costs.items()},
            ))

        trends.append(TrendRead(
            gateway_id=gw_id,
            gateway_name=gateway_names.get(gw_id, "Unknown"),
            range=range_param,
            days=days_data,
            last_collected_at=snapshot.collected_at,
        ))

    return trends
|
||||
|
|
|
|||
|
|
@ -136,3 +136,92 @@ class CostBreakdownRead(SQLModel):
|
|||
total_cost: float
|
||||
breakdown: list[dict[str, float]] # [{"model": "claude-opus", "cost": 1.23, "percent": 45.6}]
|
||||
last_collected_at: datetime
|
||||
|
||||
|
||||
class HealthSummaryRead(SQLModel):
    """Health summary payload - gateway health status and metrics.

    One entry per gateway: the latest system-health snapshot resolved to a
    flat, chart-friendly shape by the /monitoring/health-summary endpoint.
    """

    gateway_id: UUID
    gateway_name: str  # resolved from Gateway; "Unknown" when the gateway row is missing
    status: str  # "online" or "offline", derived from the snapshot's liveness flag
    pid: int | None = None  # gateway process id, when reported
    uptime_seconds: int | None = None  # gateway uptime (ms rounded to whole seconds)
    memory_mb: float | None = None  # RAM used by the host, in MB (2 dp)
    cpu_percent: float | None = None
    ram_percent: float | None = None
    swap_percent: float | None = None
    disk_percent: float | None = None
    compaction_mode: str | None = None  # always None for now - not tracked in current schema
    last_collected_at: datetime  # collected_at of the snapshot this row was built from
|
||||
|
||||
|
||||
class CronSummaryRead(SQLModel):
    """Cron summary payload - cron job status and scheduling.

    One entry per cron job row, produced by the /monitoring/cron-summary
    endpoint.
    """

    gateway_id: UUID
    gateway_name: str  # resolved from Gateway; "Unknown" when the gateway row is missing
    job_name: str
    schedule: str  # schedule expression as stored on the job
    enabled: bool
    status: str
    last_run_at: datetime | None = None
    next_run_at: datetime | None = None
    last_error: str | None = None  # message from the most recent failure, if any
    failure_count: int
    model: str | None = None  # extracted from metadata_["model"] when present
    metadata_: dict | None = None  # raw job metadata, passed through unmodified
|
||||
|
||||
|
||||
class SessionSummaryRead(SQLModel):
    """Session summary payload - session events with context and token info.

    One entry per session event, produced by the /monitoring/sessions-summary
    endpoint.
    """

    gateway_id: UUID
    gateway_name: str  # resolved from Gateway; "Unknown" when the gateway row is missing
    session_key: str
    event_type: str
    model: str | None = None  # raw model identifier from the event
    model_display_name: str | None = None  # human-readable name via data_processing.ModelName
    agent_id: str | None = None
    channel: str | None = None
    context_percent: float | None = None  # context-window utilization, when reported
    prompt_tokens: int | None = None  # from the event's token_counts dict
    completion_tokens: int | None = None  # from the event's token_counts dict
    total_tokens: int | None = None  # from token_counts, or prompt + completion when total is absent
    last_collected_at: datetime  # collected_at of the underlying event
|
||||
|
||||
|
||||
class SubAgentSummaryRead(SQLModel):
    """Sub-agent summary payload - sub-agent runs with cost and token breakdown.

    One entry per sub-agent run, produced by the /monitoring/sub-agents-summary
    endpoint.
    """

    gateway_id: UUID
    gateway_name: str  # resolved from Gateway; "Unknown" when the gateway row is missing
    agent_id: str | None = None  # populated from the run's agent name
    session_key: str  # the run's parent session key
    model: str | None = None  # raw model identifier from the run
    model_display_name: str | None = None  # human-readable name via data_processing.ModelName
    status: str  # run status, e.g. pending / running / succeeded / failed
    cost: float | None = None
    duration_seconds: float | None = None  # run duration (ms converted to seconds, 2 dp)
    prompt_tokens: int | None = None  # from the run's token_counts dict
    completion_tokens: int | None = None  # from the run's token_counts dict
    total_tokens: int | None = None  # from token_counts, or prompt + completion when total is absent
    last_collected_at: datetime  # collected_at of the underlying run
|
||||
|
||||
|
||||
class TrendDay(SQLModel):
    """Single day of trend data for the /monitoring/trends endpoint."""

    date: str  # day label, "YYYY-MM-DD"
    total_cost: float  # sum of that day's model costs, rounded to 2 dp
    total_tokens: int  # sum of that day's per-model token counts
    model_costs: dict[str, float]  # keyed by display name (data_processing.ModelName), values rounded to 2 dp
|
||||
|
||||
|
||||
class TrendRead(SQLModel):
    """Trend summary payload - cost and token trends over time.

    One entry per gateway, produced by the /monitoring/trends endpoint.
    """

    gateway_id: UUID
    gateway_name: str  # resolved from Gateway; "Unknown" when the gateway row is missing
    range: str  # the requested range string, e.g. "7d" or "30d"
    days: list[TrendDay]  # one bucket per day, oldest first, ending today
    last_collected_at: datetime  # collected_at of the snapshot the trend was built from
|
||||
|
|
|
|||
Loading…
Reference in New Issue