Observability Stack
The iHospita observability stack provides comprehensive monitoring, logging, and tracing capabilities using Prometheus, Loki, Tempo, and Grafana.
Architecture
┌─────────────────────────────────────────────────────────────────────┐
│ OBSERVABILITY STACK │
├─────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────────────────────────────────────────────────────┐ │
│ │ GRAFANA │ │
│ │ - Dashboards - Alerting - Exploration │ │
│ │ Port: 3000 │ │
│ └───────────────────────────────────────────────────────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Prometheus │ │ Loki │ │ Tempo │ │
│ │ (Metrics) │ │ (Logs) │ │ (Traces) │ │
│ │ Port: 9090 │ │ Port: 3100 │ │ Port: 3200 │ │
│ └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │
│ │ Targets │ │ Promtail │ │ Services │ │
│ │ /metrics │ │ (Log Agent) │ │ (OTLP) │ │
│ └─────────────┘ └─────────────┘ └─────────────┘ │
│ │
│ ┌─────────────────────────────────────────────────────────────┐ │
│ │ ALERTMANAGER │ │
│ │ - Alert Routing - Notifications - Silencing │ │
│ │ Port: 9093 │ │
│ └─────────────────────────────────────────────────────────────┘ │
│ │
└─────────────────────────────────────────────────────────────────────┘
Docker Compose Configuration
# docker-compose.observability.yml
# Full observability stack: Prometheus (metrics), Loki (logs), Tempo (traces),
# Grafana (visualization), Alertmanager (alert routing), Promtail (log shipper).
# Assumes the external network `ihospita-network` already exists
# (create with: docker network create ihospita-network).
version: '3.8'
services:
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: ihospita-prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      # Keep 15 days of TSDB data; raise if disk allows.
      - '--storage.tsdb.retention.time=15d'
      # Enables POST /-/reload for config reloads without a restart.
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/alerts:/etc/prometheus/alerts
      - prometheus_data:/prometheus
    networks:
      - ihospita-network
  loki:
    image: grafana/loki:2.9.0
    container_name: ihospita-loki
    ports:
      - "3100:3100"
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki/loki-config.yaml:/etc/loki/local-config.yaml
      - loki_data:/loki
    networks:
      - ihospita-network
  promtail:
    image: grafana/promtail:2.9.0
    container_name: ihospita-promtail
    volumes:
      - ./promtail/promtail-config.yaml:/etc/promtail/config.yml
      # Read-only mounts: promtail only tails host and container logs.
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    command: -config.file=/etc/promtail/config.yml
    networks:
      - ihospita-network
  tempo:
    image: grafana/tempo:2.3.0
    container_name: ihospita-tempo
    command: ["-config.file=/etc/tempo/tempo.yaml"]
    ports:
      - "3200:3200"   # tempo query/API
      - "4317:4317"   # otlp grpc ingest
      - "4318:4318"   # otlp http ingest
    volumes:
      - ./tempo/tempo.yaml:/etc/tempo/tempo.yaml
      - tempo_data:/var/tempo
    networks:
      - ihospita-network
  grafana:
    image: grafana/grafana:10.2.0
    container_name: ihospita-grafana
    environment:
      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER}
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD}
      # Compose `environment` values must be strings — a bare YAML boolean
      # (false) is an invalid type under the Compose spec, so quote it.
      GF_USERS_ALLOW_SIGN_UP: "false"
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - grafana_data:/var/lib/grafana
    networks:
      - ihospita-network
  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: ihospita-alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    networks:
      - ihospita-network
networks:
  ihospita-network:
    external: true
volumes:
  prometheus_data:
  loki_data:
  tempo_data:
  grafana_data:
Prometheus Configuration
prometheus.yml
global:
  # Scrape all targets and evaluate alert rules every 15s.
  scrape_interval: 15s
  evaluation_interval: 15s
# Fired alerts are pushed to the Alertmanager container (compose service name).
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093
# Alert rule files mounted from ./prometheus/alerts in docker-compose.
rule_files:
  - /etc/prometheus/alerts/*.yml
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
  # Kong API Gateway
  # NOTE(review): 8001 is Kong's admin API port — assumes the Prometheus
  # plugin exposes /metrics there; verify against the Kong deployment.
  - job_name: 'kong'
    static_configs:
      - targets: ['kong:8001']
    metrics_path: /metrics
  # NestJS Services — each service exposes /metrics via the
  # PrometheusModule registered in metrics.module.ts (see below).
  - job_name: 'hms-service'
    static_configs:
      - targets: ['hms-service:3000']
    metrics_path: /metrics
  - job_name: 'crm-service'
    static_configs:
      - targets: ['crm-service:3001']
    metrics_path: /metrics
  - job_name: 'payment-service'
    static_configs:
      - targets: ['payment-service:3002']
    metrics_path: /metrics
  - job_name: 'queue-service'
    static_configs:
      - targets: ['queue-service:3003']
    metrics_path: /metrics
  - job_name: 'report-service'
    static_configs:
      - targets: ['report-service:3004']
    metrics_path: /metrics
  # PostgreSQL — scraped indirectly through postgres_exporter.
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
  # Redis — scraped indirectly through redis_exporter.
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
  # Keycloak
  # NOTE(review): /metrics on Keycloak requires the metrics SPI/feature to
  # be enabled — confirm the Keycloak image is configured for it.
  - job_name: 'keycloak'
    static_configs:
      - targets: ['keycloak:8080']
    metrics_path: /metrics
Alert Rules
alerts.yml
groups:
  - name: ihospita-alerts
    rules:
      # High Error Rate — cluster-wide ratio of 5xx responses to all
      # responses over a 5-minute window; fires after 5 sustained minutes.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
          / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for 5 minutes"
      # Service Down — `up` is 0 when a scrape target fails; fires per job.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
      # High Latency — per-service p95 request duration above 2 seconds.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m]))
            by (le, service)
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency on {{ $labels.service }}"
      # Database Connection Pool
      # NOTE(review): threshold 90 assumes a pool/max_connections of ~100 —
      # confirm against the PostgreSQL configuration.
      - alert: DatabaseConnectionPoolExhausted
        expr: pg_stat_activity_count > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool nearly exhausted"
      # Redis Memory — requires maxmemory to be set, otherwise
      # redis_memory_max_bytes is 0 and the ratio never fires.
      - alert: RedisHighMemory
        expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis memory usage above 80%"
      # Disk Space
      # NOTE(review): fires per filesystem, including tmpfs/overlay mounts —
      # consider a {fstype!~"tmpfs|overlay"} filter if that proves noisy.
      - alert: DiskSpaceLow
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk space below 10%"
Grafana Dashboards
Dashboard: API Performance
{
"dashboard": {
"title": "iHospita API Performance",
"panels": [
{
"title": "Request Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total[5m])) by (service)",
"legendFormat": "{{service}}"
}
]
},
{
"title": "Error Rate",
"type": "graph",
"targets": [
{
"expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (service)",
"legendFormat": "{{service}}"
}
]
},
{
"title": "P95 Latency",
"type": "graph",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, service))",
"legendFormat": "{{service}}"
}
]
}
]
}
}
Loki Configuration
loki-config.yaml
# Single-binary Loki 2.9 config: no multi-tenancy, in-memory ring,
# boltdb-shipper index with chunks on the local filesystem (the /loki
# volume mounted in docker-compose).
auth_enabled: false
server:
  http_listen_port: 3100
ingester:
  lifecycler:
    ring:
      kvstore:
        # Single-node deployment: ring state kept in memory, one replica.
        store: inmemory
      replication_factor: 1
  # Flush chunks idle for 5m; keep flushed chunks in memory 30s for queries.
  chunk_idle_period: 5m
  chunk_retain_period: 30s
schema_config:
  configs:
    - from: 2023-01-01
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h
storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    # NOTE(review): `shared_store` and `enforce_metric_name` are valid for
    # Loki 2.9 but were removed in 3.x — revisit on upgrade.
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks
limits_config:
  enforce_metric_name: false
  # Drop log entries older than 7 days (168h) at ingest time.
  reject_old_samples: true
  reject_old_samples_max_age: 168h
NestJS Metrics Integration
// metrics.module.ts
//
// Exposes Prometheus metrics for a NestJS service via
// @willsoto/nestjs-prometheus. With defaultMetrics enabled, the standard
// prom-client process/runtime metrics (CPU, memory, event loop lag, GC)
// are collected automatically and served at GET /metrics — the path the
// prometheus.yml scrape jobs above point at.
import { Module } from '@nestjs/common';
import { PrometheusModule } from '@willsoto/nestjs-prometheus';
@Module({
  imports: [
    PrometheusModule.register({
      defaultMetrics: {
        enabled: true,
      },
      path: '/metrics',
    }),
  ],
})
export class MetricsModule {}
Custom Metrics
import { Counter, Histogram } from 'prom-client';
// Request counter — one increment per completed HTTP request, labeled by
// method/path/status. Feeds the HighErrorRate alert and the Request Rate
// and Error Rate dashboard panels (http_requests_total).
const httpRequestsTotal = new Counter({
  name: 'http_requests_total',
  help: 'Total HTTP requests',
  labelNames: ['method', 'path', 'status'],
});
// Request duration histogram — backs the HighLatency alert and the P95
// Latency panel via histogram_quantile over the _bucket series.
// Bucket boundaries are in seconds; 2s matches the alert threshold.
const httpRequestDuration = new Histogram({
  name: 'http_request_duration_seconds',
  help: 'HTTP request duration',
  labelNames: ['method', 'path'],
  buckets: [0.1, 0.5, 1, 2, 5],
});
Key Metrics
| Metric | Description | Alert Threshold |
|---|---|---|
| `http_requests_total` | Total HTTP requests | N/A |
| `http_request_duration_seconds` | Request latency | P95 > 2s |
| `http_requests_errors_total` | Error count | Rate > 5% |
| `pg_stat_activity_count` | DB connections | > 90 |
| `redis_memory_used_bytes` | Redis memory | > 80% |
| `node_filesystem_avail_bytes` | Disk space | < 10% |