콘텐츠로 이동

Prometheus/Grafana/Loki를 사용한 로깅 및 모니터링

DoveRunner Mobile App Security 온프레미스 솔루션을 위한 Kubernetes 로깅 및 모니터링

Section titled “DoveRunner Mobile App Security 온프레미스 솔루션을 위한 Kubernetes 로깅 및 모니터링”

이 가이드는 DoveRunner Mobile App Security 온프레미스 배포를 위한 포괄적인 관찰성 스택을 구축하는 방법을 제공합니다. Prometheus(메트릭), Grafana(시각화), Loki(로그)를 사용하여 완전한 모니터링 솔루션을 구성합니다.

Applications → Metrics → Prometheus → Grafana
Logs → Promtail → Loki → Grafana
Traces → Jaeger → Grafana
Terminal window
# Add the Prometheus Community Helm repository and refresh the local chart index
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
helm repo update
# Create the namespace that the later `helm install --namespace monitoring` commands target
kubectl create namespace monitoring
prometheus-values.yaml
# Helm values for the kube-prometheus-stack chart (passed via --values).
# Configures Prometheus storage/retention, an extra scrape job for the
# DoveRunner API pods, Grafana persistence and dashboard provider, and
# Alertmanager storage.
prometheus:
  prometheusSpec:
    retention: 30d
    # Keep the size-based retention below the 100Gi volume requested further
    # down: the WAL and in-progress compactions need headroom, otherwise the
    # disk fills before retention can trigger.
    retentionSize: "85GiB"
    storageSpec:
      volumeClaimTemplate:
        spec:
          storageClassName: fast-ssd
          resources:
            requests:
              storage: 100Gi
    resources:
      requests:
        memory: 4Gi
        cpu: 2
      limits:
        memory: 8Gi
        cpu: 4
    # Scrape rules for DoveRunner Mobile App Security metrics
    additionalScrapeConfigs:
      - job_name: 'doverunner-api'
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - doverunner
        relabel_configs:
          # Only pods labelled app=doverunner-api
          - source_labels: [__meta_kubernetes_pod_label_app]
            action: keep
            regex: doverunner-api
          # Only pods annotated prometheus.io/scrape=true
          # (quoted so YAML keeps it a string, not a boolean)
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
            action: keep
            regex: 'true'
          # Honour a custom metrics path from the prometheus.io/path annotation
          - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
            action: replace
            target_label: __metrics_path__
            regex: (.+)
          # Rewrite the scrape address to use the prometheus.io/port annotation
          - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
            action: replace
            regex: ([^:]+)(?::\d+)?;(\d+)
            replacement: $1:$2
            target_label: __address__
grafana:
  # Placeholder value: replace before deploying and keep the real password
  # out of version control.
  adminPassword: "admin-secure-password"
  persistence:
    enabled: true
    size: 10Gi
  # Custom dashboards
  dashboardProviders:
    dashboardproviders.yaml:
      apiVersion: 1
      providers:
        - name: 'doverunner'
          orgId: 1
          folder: 'DoveRunner Mobile App Security'
          type: file
          disableDeletion: false
          editable: true
          options:
            path: /var/lib/grafana/dashboards/doverunner
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          storageClassName: fast-ssd
          resources:
            requests:
              storage: 10Gi
nodeExporter:
  enabled: true
kubeStateMetrics:
  enabled: true
Terminal window
# Install the kube-prometheus-stack chart as release "kube-prometheus-stack" into the
# monitoring namespace using prometheus-values.yaml; --wait blocks until the release is ready.
helm install kube-prometheus-stack prometheus-community/kube-prometheus-stack \
--namespace monitoring \
--values prometheus-values.yaml \
--wait

DoveRunner Mobile App Security ServiceMonitor 구성

Section titled “DoveRunner Mobile App Security ServiceMonitor 구성”
doverunner-servicemonitor.yaml
# ServiceMonitor that scrapes Services labelled app=doverunner in the
# doverunner namespace on their "metrics" port.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: doverunner-metrics
  namespace: monitoring
  labels:
    app: doverunner
    # With default chart values, kube-prometheus-stack only selects
    # ServiceMonitors carrying the Helm release label; the release above is
    # installed as "kube-prometheus-stack". Adjust if the release name differs.
    release: kube-prometheus-stack
spec:
  namespaceSelector:
    matchNames:
      - doverunner
  selector:
    matchLabels:
      app: doverunner
  endpoints:
    # Both paths are scraped on the same port every 30s
    - port: metrics
      interval: 30s
      path: /actuator/prometheus
    - port: metrics
      interval: 30s
      path: /metrics
loki-values.yaml
# Helm values for the grafana/loki-stack chart (passed via --values).
# NOTE(review): the nesting below was reconstructed from a flattened listing;
# confirm key placement against the values schema of the chart version in use.
loki:
  auth_enabled: false
  server:
    http_listen_port: 3100
  common:
    path_prefix: /loki
    storage:
      filesystem:
        chunks_directory: /loki/chunks
        rules_directory: /loki/rules
    replication_factor: 1
  schema_config:
    configs:
      # Quoted so YAML parsers keep the date a string instead of a timestamp
      - from: "2020-10-24"
        store: boltdb-shipper
        object_store: filesystem
        schema: v11
        index:
          prefix: index_
          period: 24h
  ruler:
    alertmanager_url: http://kube-prometheus-stack-alertmanager:9093
  limits_config:
    enforce_metric_name: false
    reject_old_samples: true
    reject_old_samples_max_age: 168h
    ingestion_rate_mb: 10
    ingestion_burst_size_mb: 20
  # Persistent storage
  persistence:
    enabled: true
    size: 50Gi
    storageClassName: fast-ssd
  # Resource limits
  resources:
    requests:
      cpu: 1
      memory: 2Gi
    limits:
      cpu: 2
      memory: 4Gi
# Promtail configuration
promtail:
  enabled: true
  config:
    serverPort: 3101
    clients:
      - url: http://loki:3100/loki/api/v1/push
    scrapeConfigs:
      # Generic job: tail the log files of every pod
      - job_name: kubernetes-pods
        kubernetes_sd_configs:
          - role: pod
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_node_name]
            target_label: __host__
          # Copy all pod labels onto the log stream
          - action: labelmap
            regex: __meta_kubernetes_pod_label_(.+)
          # job = <namespace>/<pod name>
          - action: replace
            replacement: $1
            separator: /
            source_labels:
              - __meta_kubernetes_namespace
              - __meta_kubernetes_pod_name
            target_label: job
          - action: replace
            source_labels:
              - __meta_kubernetes_namespace
            target_label: namespace
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_name
            target_label: pod
          - action: replace
            source_labels:
              - __meta_kubernetes_pod_container_name
            target_label: container
          # Build the glob of log files to tail from pod UID and container name
          - replacement: /var/log/pods/*$1/*.log
            separator: /
            source_labels:
              - __meta_kubernetes_pod_uid
              - __meta_kubernetes_pod_container_name
            target_label: __path__
      # DoveRunner Mobile App Security specific labelling
      # NOTE(review): unlike the job above, this job sets no __path__ label;
      # confirm whether Promtail actually tails any files for it, and whether
      # adding one would duplicate the lines already shipped by kubernetes-pods.
      - job_name: doverunner-apps
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - doverunner
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app]
            regex: doverunner-.*
            action: keep
          - source_labels: [__meta_kubernetes_pod_label_app]
            target_label: doverunner_service
        pipeline_stages:
          # Parse JSON log lines and extract the listed fields
          - json:
              expressions:
                level: level
                timestamp: timestamp
                message: message
                service: service
          # Promote the extracted level and service fields to stream labels
          - labels:
              level:
              service:
          # Use the timestamp found in the log line as the entry timestamp
          - timestamp:
              source: timestamp
              format: RFC3339
Terminal window
# Add the Grafana Helm repository, refresh the chart index, then install the
# loki-stack chart as release "loki" into the monitoring namespace using loki-values.yaml.
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
helm install loki grafana/loki-stack \
--namespace monitoring \
--values loki-values.yaml \
--wait

DoveRunner Mobile App Security 대시보드

Section titled “DoveRunner Mobile App Security 대시보드”
{
  "dashboard": {
    "id": null,
    "title": "DoveRunner Mobile App Security - API Performance",
    "tags": ["doverunner", "api"],
    "timezone": "Asia/Seoul",
    "panels": [
      {
        "id": 1,
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total{job=\"doverunner-api\"}[5m])",
            "legendFormat": "{{method}} {{handler}}"
          }
        ],
        "yAxes": [
          {
            "label": "requests/sec"
          }
        ]
      },
      {
        "id": 2,
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job=\"doverunner-api\"}[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(http_request_duration_seconds_bucket{job=\"doverunner-api\"}[5m]))",
            "legendFormat": "50th percentile"
          }
        ]
      },
      {
        "id": 3,
        "title": "Error Rate",
        "type": "stat",
        "targets": [
          {
            "expr": "sum(rate(http_requests_total{job=\"doverunner-api\",status=~\"5..\"}[5m])) / sum(rate(http_requests_total{job=\"doverunner-api\"}[5m])) * 100",
            "legendFormat": "Error Rate %"
          }
        ]
      },
      {
        "id": 4,
        "title": "Active Sealing Jobs",
        "type": "stat",
        "targets": [
          {
            "expr": "doverunner_sealing_jobs_active",
            "legendFormat": "Active Jobs"
          }
        ]
      }
    ],
    "time": {
      "from": "now-1h",
      "to": "now"
    },
    "refresh": "30s"
  }
}
{
  "dashboard": {
    "title": "DoveRunner Mobile App Security - Security Events",
    "panels": [
      {
        "id": 1,
        "title": "Threat Detection Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(doverunner_threats_detected_total[5m])",
            "legendFormat": "{{threat_type}}"
          }
        ]
      },
      {
        "id": 2,
        "title": "Top Threat Types",
        "type": "piechart",
        "targets": [
          {
            "expr": "topk(10, increase(doverunner_threats_detected_total[1h]))",
            "legendFormat": "{{threat_type}}"
          }
        ]
      },
      {
        "id": 3,
        "title": "Geographic Distribution",
        "type": "worldmap",
        "targets": [
          {
            "expr": "sum by (country) (increase(doverunner_threats_detected_total[1h]))",
            "legendFormat": "{{country}}"
          }
        ]
      },
      {
        "id": 4,
        "title": "Security Logs",
        "type": "logs",
        "targets": [
          {
            "expr": "{namespace=\"doverunner\", level=\"ERROR\"} |= \"THREAT_DETECTED\"",
            "refId": "A"
          }
        ]
      }
    ]
  }
}
doverunner-alerts.yaml
# PrometheusRule defining the DoveRunner alerting rules.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: doverunner-alerts
  namespace: monitoring
  labels:
    app: doverunner
    # With default chart values, kube-prometheus-stack only loads
    # PrometheusRules carrying the Helm release label; the release above is
    # installed as "kube-prometheus-stack". Adjust if the release name differs.
    release: kube-prometheus-stack
spec:
  groups:
    - name: doverunner.rules
      rules:
        # Share of 5xx responses across the whole API. Both sides are summed
        # first: dividing the raw per-series vectors only matches series with
        # identical label sets (including status), which never yields the
        # overall error ratio.
        - alert: DoveRunnerAPIHighErrorRate
          expr: sum(rate(http_requests_total{job="doverunner-api",status=~"5.."}[5m])) / sum(rate(http_requests_total{job="doverunner-api"}[5m])) > 0.05
          for: 5m
          labels:
            severity: warning
            service: api
          annotations:
            summary: "High error rate on DoveRunner Mobile App Security API"
            description: "API error rate is {{ $value | humanizePercentage }} for the last 5 minutes"
        # Fires immediately (for: 0m) when more than 5 sealing jobs failed in 10 minutes
        - alert: DoveRunnerSealingJobsFailed
          expr: increase(doverunner_sealing_jobs_failed_total[10m]) > 5
          for: 0m
          labels:
            severity: critical
            service: sealing
          annotations:
            summary: "Multiple sealing jobs failed"
            description: "{{ $value }} sealing jobs failed in the last 10 minutes"
        - alert: DoveRunnerHighThreatDetection
          expr: rate(doverunner_threats_detected_total[5m]) > 100
          for: 2m
          labels:
            severity: warning
            service: security
          annotations:
            summary: "High threat detection rate"
            description: "Threat detection rate is {{ $value }} per second"
        # The expression watches the API scrape target, so the alert is named
        # and labelled for the API rather than for the database.
        - alert: DoveRunnerAPIDown
          expr: up{job="doverunner-api"} == 0
          for: 1m
          labels:
            severity: critical
            service: api
          annotations:
            summary: "DoveRunner Mobile App Security API is down"
            description: "API service has been down for more than 1 minute"
alertmanager-config.yaml
# Alertmanager configuration: SMTP settings, routing tree and receivers.
# All addresses, passwords and webhook URLs below are placeholders.
global:
  smtp_smarthost: 'smtp.gmail.com:587'
  smtp_from: 'alerts@yourdomain.com'
  smtp_auth_username: 'alerts@yourdomain.com'
  smtp_auth_password: 'your-app-password'
route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default-receiver'
  routes:
    # Routes are evaluated in order; the first match wins, so a critical
    # security alert goes to 'critical-alerts'.
    - match:
        severity: critical
      receiver: 'critical-alerts'
    - match:
        service: security
      receiver: 'security-team'
receivers:
  - name: 'default-receiver'
    email_configs:
      - to: 'admin@yourdomain.com'
        # email_configs has no `subject`/`body` fields: the subject is set as
        # a header and the plain-text body through `text`.
        headers:
          Subject: '[DoveRunner] {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts }}
          Alert: {{ .Annotations.summary }}
          Description: {{ .Annotations.description }}
          {{ end }}
  - name: 'critical-alerts'
    email_configs:
      - to: 'critical@yourdomain.com'
        headers:
          Subject: '[CRITICAL] DoveRunner Mobile App Security Alert'
    slack_configs:
      - api_url: 'YOUR_SLACK_WEBHOOK_URL'
        channel: '#alerts-critical'
        title: 'Critical Alert: {{ .GroupLabels.alertname }}'
        text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
  - name: 'security-team'
    email_configs:
      - to: 'security@yourdomain.com'
        headers:
          Subject: '[SECURITY] Threat Detection Alert'
    webhook_configs:
      - url: 'http://security-automation:8080/webhook'
        send_resolved: true
# Error logs for a specific service
{namespace="doverunner", service="api"} |= "ERROR"
# Threat-detection logs. LogQL log queries take no inline time range: select
# "Last 1 hour" in the Grafana time picker (or pass --since=1h to logcli).
{namespace="doverunner"} |= "THREAT_DETECTED"
# Activity logs for a specific user (set the query time range to the last 24 hours)
{namespace="doverunner"} |~ "user_id.*12345"
# Failed sealing-job logs (set the query time range to the last 6 hours)
{namespace="doverunner", service="sealing"} |= "SEALING_FAILED"
# Number of error log lines per hour
count_over_time({namespace="doverunner", level="ERROR"}[1h])
# Log line rate per service; `by` is only valid on an aggregation operator,
# so the rate is wrapped in sum
sum by (service) (rate({namespace="doverunner"}[5m]))
# Extract JSON fields, filter on them and reformat the line
{namespace="doverunner"} | json | threat_level="HIGH" | line_format "{{.timestamp}} {{.device_id}} {{.threat_type}}"
# Add to prometheus.yml (inside the relevant scrape config)
metric_relabel_configs:
  # Drop whole metrics whose name matches the high-cardinality pattern
  - source_labels: [__name__]
    regex: 'high_cardinality_metric_.*'
    action: drop
  # Collapse every non-empty user_id value into the constant "hashed".
  # The regex is '.+' rather than '.*' so that series without a user_id label
  # (empty value) are left untouched instead of gaining user_id="hashed".
  - source_labels: [user_id]
    regex: '.+'
    target_label: user_id
    replacement: 'hashed'
# Prometheus retention policy
# NOTE(review): confirm where these keys are consumed; upstream Prometheus
# normally takes retention through --storage.tsdb.retention.* flags, and the
# kube-prometheus-stack values above use retention / retentionSize instead.
retention: 30d
retention_size: 100GB
# Compression (block compaction) settings
# NOTE(review): indentation was lost in this listing; presumably the keys
# below nest under tsdb, otherwise `retention` is a duplicate key. Confirm.
tsdb:
min_block_duration: 2h
max_block_duration: 25h
retention: 30d
# Inefficient query: grouping by method, handler and status returns many time series
sum(rate(http_requests_total[5m])) by (method, handler, status)
# Efficient query: group only by the label that is actually needed
sum(rate(http_requests_total[5m])) by (method)
Terminal window
# Create a TSDB snapshot
# NOTE(review): the admin API is presumably disabled by default; confirm it is
# enabled on this Prometheus before relying on this call.
curl -XPOST http://prometheus:9090/api/v1/admin/tsdb/snapshot
# Backup script
#!/bin/bash
set -euo pipefail
BACKUP_DIR="/backup/prometheus/$(date +%Y%m%d)"
# The dated directory does not exist yet; create it so the redirect below cannot fail
mkdir -p "$BACKUP_DIR"
# NOTE(review): confirm the pod name and data path with
# `kubectl get pods -n monitoring`; operator-managed pods are usually named
# after the Helm release rather than plain "prometheus-0".
kubectl exec prometheus-0 -n monitoring -- tar -czf - /prometheus/data > "$BACKUP_DIR/prometheus-data.tar.gz"
Terminal window
# Back up dashboards and settings by streaming a tarball of /var/lib/grafana to the local machine
# NOTE(review): confirm the Grafana pod name with `kubectl get pods -n monitoring`;
# "grafana-0" presumes a StatefulSet-style pod name.
kubectl exec grafana-0 -n monitoring -- tar -czf - /var/lib/grafana > grafana-backup.tar.gz

이 가이드를 따르면 DoveRunner Mobile App Security 온프레미스 배포에 대한 포괄적인 관찰성과 모니터링 시스템을 구축할 수 있습니다.