跳到主要内容

Observability Stack

The iHospita observability stack provides comprehensive monitoring, logging, and tracing capabilities using Prometheus, Loki, Tempo, and Grafana.


Architecture

┌─────────────────────────────────────────────────────────────────────┐
│                         OBSERVABILITY STACK                         │
├─────────────────────────────────────────────────────────────────────┤
│                                                                     │
│   ┌─────────────────────────────────────────────────────────────┐   │
│   │                           GRAFANA                           │   │
│   │         - Dashboards    - Alerting    - Exploration         │   │
│   │                         Port: 3000                          │   │
│   └─────────────────────────────────────────────────────────────┘   │
│             │                    │                    │             │
│             ▼                    ▼                    ▼             │
│      ┌─────────────┐      ┌─────────────┐      ┌─────────────┐      │
│      │ Prometheus  │      │    Loki     │      │    Tempo    │      │
│      │  (Metrics)  │      │   (Logs)    │      │  (Traces)   │      │
│      │ Port: 9090  │      │ Port: 3100  │      │ Port: 3200  │      │
│      └──────┬──────┘      └──────┬──────┘      └──────┬──────┘      │
│             │                    │                    │             │
│             ▼                    ▼                    ▼             │
│      ┌─────────────┐      ┌─────────────┐      ┌─────────────┐      │
│      │   Targets   │      │  Promtail   │      │  Services   │      │
│      │  /metrics   │      │ (Log Agent) │      │   (OTLP)    │      │
│      └─────────────┘      └─────────────┘      └─────────────┘      │
│                                                                     │
│   ┌─────────────────────────────────────────────────────────────┐   │
│   │                        ALERTMANAGER                         │   │
│   │      - Alert Routing    - Notifications    - Silencing      │   │
│   │                         Port: 9093                          │   │
│   └─────────────────────────────────────────────────────────────┘   │
│                                                                     │
└─────────────────────────────────────────────────────────────────────┘

Docker Compose Configuration

# docker-compose.observability.yml
#
# Observability stack: Prometheus (metrics), Loki + Promtail (logs),
# Tempo (traces), Grafana (dashboards) and Alertmanager (alert routing).
# Every service attaches to the external network `ihospita-network`, which
# is declared `external` below and therefore must exist before startup.
version: '3.8'

services:
  # Metrics server; configuration and alert rules are bind-mounted from
  # ./prometheus, time-series data lives in the `prometheus_data` volume.
  prometheus:
    image: prom/prometheus:v2.48.0
    container_name: ihospita-prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=15d'
      - '--web.enable-lifecycle'
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/alerts:/etc/prometheus/alerts
      - prometheus_data:/prometheus
    networks:
      - ihospita-network

  # Log store; data is kept in the `loki_data` volume mounted at /loki.
  loki:
    image: grafana/loki:2.9.0
    container_name: ihospita-loki
    command: -config.file=/etc/loki/local-config.yaml
    ports:
      - "3100:3100"
    volumes:
      - ./loki/loki-config.yaml:/etc/loki/local-config.yaml
      - loki_data:/loki
    networks:
      - ihospita-network

  # Log agent; host log directories are mounted read-only.
  promtail:
    image: grafana/promtail:2.9.0
    container_name: ihospita-promtail
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./promtail/promtail-config.yaml:/etc/promtail/config.yml
      - /var/log:/var/log:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - ihospita-network

  # Trace store; also publishes the OTLP gRPC and HTTP ports.
  tempo:
    image: grafana/tempo:2.3.0
    container_name: ihospita-tempo
    command: ["-config.file=/etc/tempo/tempo.yaml"]
    ports:
      - "3200:3200"  # tempo
      - "4317:4317"  # otlp grpc
      - "4318:4318"  # otlp http
    volumes:
      - ./tempo/tempo.yaml:/etc/tempo/tempo.yaml
      - tempo_data:/var/tempo
    networks:
      - ihospita-network

  # Dashboards and exploration UI.
  grafana:
    image: grafana/grafana:10.2.0
    container_name: ihospita-grafana
    environment:
      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER}
      # `:?` makes Compose abort with this message when the variable is unset
      # or empty, instead of silently substituting an empty admin password.
      GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:?GRAFANA_ADMIN_PASSWORD must be set}"
      # Quoted so YAML passes the string "false" rather than a boolean;
      # environment values are expected to be strings.
      GF_USERS_ALLOW_SIGN_UP: "false"
    ports:
      - "3000:3000"
    volumes:
      - ./grafana/provisioning:/etc/grafana/provisioning
      - grafana_data:/var/lib/grafana
    networks:
      - ihospita-network

  # Alert routing and notifications.
  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: ihospita-alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
    networks:
      - ihospita-network

networks:
  ihospita-network:
    external: true

# Named volumes for persistent data; empty mappings select the default driver.
volumes:
  prometheus_data: {}
  loki_data: {}
  tempo_data: {}
  grafana_data: {}

Prometheus Configuration

prometheus.yml

# Scrape and rule-evaluation settings applied to every job unless overridden.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Alerts are forwarded to the Alertmanager instance at alertmanager:9093.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Every *.yml file in this directory is loaded as a rule file.
rule_files:
  - /etc/prometheus/alerts/*.yml

scrape_configs:
  # --- Prometheus itself ---
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'

  # --- Kong API Gateway ---
  - job_name: kong
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'kong:8001'

  # --- NestJS services (one job per service) ---
  - job_name: hms-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'hms-service:3000'

  - job_name: crm-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'crm-service:3001'

  - job_name: payment-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'payment-service:3002'

  - job_name: queue-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'queue-service:3003'

  - job_name: report-service
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'report-service:3004'

  # --- PostgreSQL (via exporter) ---
  - job_name: postgres
    static_configs:
      - targets:
          - 'postgres-exporter:9187'

  # --- Redis (via exporter) ---
  - job_name: redis
    static_configs:
      - targets:
          - 'redis-exporter:9121'

  # --- Keycloak ---
  - job_name: keycloak
    metrics_path: /metrics
    static_configs:
      - targets:
          - 'keycloak:8080'

Alert Rules

alerts.yml

# Alerting rules for the iHospita stack, loaded by Prometheus as a rule file.
groups:
  - name: ihospita-alerts
    rules:
      # High Error Rate: share of 5xx responses across all scraped requests.
      - alert: HighErrorRate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m]))
            / sum(rate(http_requests_total[5m])) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High error rate detected"
          description: "Error rate is above 5% for 5 minutes"

      # Service Down: any scrape target that has been unreachable for 1 minute.
      - alert: ServiceDown
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"

      # High Latency: P95 request duration above 2 seconds.
      # Grouped by `job` (the scrape job name) because the histogram is only
      # declared with `method` and `path` labels; a `service` label is not
      # produced by the metric definitions or scrape configs in this document.
      - alert: HighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket[5m]))
            by (le, job)
          ) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High latency on {{ $labels.job }}"

      # Database Connection Pool: total connections per PostgreSQL instance.
      # Summed because the exporter is expected to report this metric as one
      # series per database/state, so comparing individual series to 90 would
      # miss a pool that is exhausted in aggregate — TODO confirm label set.
      - alert: DatabaseConnectionPoolExhausted
        expr: sum by (instance) (pg_stat_activity_count) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Database connection pool nearly exhausted"

      # Redis Memory: used memory above 80% of the configured maximum.
      # The `> 0` filter drops instances without a memory limit; dividing by a
      # zero limit yields +Inf, which would keep this alert firing forever.
      - alert: RedisHighMemory
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis memory usage above 80%"

      # Disk Space: less than 10% of a filesystem available.
      # NOTE(review): no scrape job exposing node_filesystem_* metrics appears
      # in the scrape configuration in this document — confirm one exists.
      - alert: DiskSpaceLow
        expr: node_filesystem_avail_bytes / node_filesystem_size_bytes < 0.1
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk space below 10%"

Grafana Dashboards

Dashboard: API Performance

{
  "dashboard": {
    "title": "iHospita API Performance",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total[5m])) by (job)",
            "legendFormat": "{{job}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{status=~\"5..\"}[5m])) by (job)",
            "legendFormat": "{{job}}"
          }
        ]
      },
      {
        "title": "P95 Latency",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, job))",
            "legendFormat": "{{job}}"
          }
        ]
      }
    ]
  }
}

Loki Configuration

loki-config.yaml

# Single-instance Loki configuration: in-memory ring, boltdb-shipper index and
# filesystem chunk storage, all kept under /loki.
auth_enabled: false

server:
  http_listen_port: 3100

ingester:
  lifecycler:
    ring:
      kvstore:
        store: inmemory
      replication_factor: 1
  chunk_idle_period: 5m
  chunk_retain_period: 30s
  # Keep the write-ahead log under /loki with the index and chunks.
  # NOTE(review): without an explicit directory Loki is expected to fall back
  # to a relative `wal` path, which sits outside /loki and may not be writable
  # by the container user — confirm against the Loki version in use.
  wal:
    dir: /loki/wal

schema_config:
  configs:
    # Quoted so the start date is always read as a string, not a YAML date.
    - from: "2023-01-01"
      store: boltdb-shipper
      object_store: filesystem
      schema: v11
      index:
        prefix: index_
        period: 24h

storage_config:
  boltdb_shipper:
    active_index_directory: /loki/boltdb-shipper-active
    cache_location: /loki/boltdb-shipper-cache
    shared_store: filesystem
  filesystem:
    directory: /loki/chunks

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  # 168h = 7 days.
  reject_old_samples_max_age: 168h

NestJS Metrics Integration

// metrics.module.ts
import { Module } from '@nestjs/common';
import { PrometheusModule } from '@willsoto/nestjs-prometheus';

/**
 * Nest module that registers the Prometheus integration with default
 * metrics enabled and the metrics path set to '/metrics'.
 */
@Module({
  imports: [
    PrometheusModule.register({
      path: '/metrics',
      defaultMetrics: { enabled: true },
    }),
  ],
})
export class MetricsModule {}

Custom Metrics

import { Counter, Histogram } from 'prom-client';

/** Counter of HTTP requests, labelled by method, path and status. */
const httpRequestsTotal = new Counter({
  help: 'Total HTTP requests',
  name: 'http_requests_total',
  labelNames: ['method', 'path', 'status'],
});

/**
 * Histogram of HTTP request duration, labelled by method and path.
 * Bucket upper bounds follow the `_seconds` unit in the metric name.
 */
const httpRequestDuration = new Histogram({
  help: 'HTTP request duration',
  name: 'http_request_duration_seconds',
  buckets: [0.1, 0.5, 1, 2, 5],
  labelNames: ['method', 'path'],
});

Key Metrics

| Metric | Description | Alert Threshold |
| --- | --- | --- |
| http_requests_total | Total HTTP requests | N/A |
| http_request_duration_seconds | Request latency | P95 > 2s |
| http_requests_errors_total | Error count | Rate > 5% |
| pg_stat_activity_count | DB connections | > 90 |
| redis_memory_used_bytes | Redis memory | > 80% |
| node_filesystem_avail_bytes | Disk space | < 10% |