コンテンツにスキップ

Vendure 監視・運用設計書

概要

Fly.io + Upstash で運用される Vendure システムの包括的な監視・運用戦略について説明します。

監視アーキテクチャ

監視対象とメトリクス

graph TB
    subgraph "Application Layer"
        V[Vendure API]
        NS[Next.js Storefront]
        W[Worker Process]
    end

    subgraph "Infrastructure Layer"
        PG[(PostgreSQL)]
        R[(Redis)]
        FS[File Storage]
    end

    subgraph "Monitoring Stack"
        PM[Prometheus]
        GF[Grafana]
        AL[AlertManager]
        LOG[Log Aggregation]
    end

    subgraph "External Services"
        DD[Datadog]
        SN[Sentry]
        UPT[Uptime Robot]
    end

    V --> PM
    NS --> PM
    W --> PM
    PG --> PM
    R --> PM
    FS --> PM

    PM --> GF
    PM --> AL
    V --> LOG
    NS --> LOG
    W --> LOG

    LOG --> DD
    V --> SN
    NS --> SN

    UPT --> V
    UPT --> NS

1. アプリケーション監視

1.1 パフォーマンスメトリクス

レスポンス時間監視

// performance-metrics.ts
import { performance } from 'perf_hooks';
import { Logger } from '@vendure/core';

/**
 * Records latency metrics for GraphQL operations and HTTP endpoints.
 *
 * All durations are supplied in milliseconds and converted to seconds before
 * being observed, because the target histograms are `*_seconds` metrics.
 *
 * Logging goes through the static `Logger` API of `@vendure/core`
 * (`Logger.verbose(message, context)`); the class does not hold a logger
 * instance because the Vendure `Logger` exposes static methods only.
 */
export class PerformanceMonitor {
  /** Context label attached to every log line emitted by this class. */
  private readonly loggerCtx = PerformanceMonitor.name;

  /** GraphQL operations slower than this (in ms) are logged as warnings. */
  private static readonly SLOW_QUERY_THRESHOLD_MS = 2000;

  /**
   * Records one GraphQL operation.
   *
   * @param operationName - Name of the GraphQL operation, used as the `operation` label.
   * @param duration - Elapsed time in milliseconds.
   */
  trackGraphQLQuery(operationName: string, duration: number) {
    Logger.verbose(`GraphQL ${operationName}: ${duration}ms`, this.loggerCtx);

    // Prometheus metric (histogram is in seconds)
    graphqlDurationHistogram
      .labels({ operation: operationName })
      .observe(duration / 1000);

    // SLA threshold check
    if (duration > PerformanceMonitor.SLOW_QUERY_THRESHOLD_MS) {
      Logger.warn(
        `Slow query detected: ${operationName} took ${duration}ms`,
        this.loggerCtx,
      );
    }
  }

  /**
   * Records one HTTP API request: its latency and a request count.
   *
   * @param endpoint - Endpoint identifier used as the `endpoint` label.
   * @param method - HTTP method used as the `method` label.
   * @param statusCode - Response status code; stored as a string label.
   * @param duration - Elapsed time in milliseconds.
   */
  trackAPIEndpoint(
    endpoint: string,
    method: string,
    statusCode: number,
    duration: number,
  ) {
    // Both metrics share the same label set, so build it once.
    const labels = { endpoint, method, status: statusCode.toString() };

    apiDurationHistogram.labels(labels).observe(duration / 1000);
    apiRequestsTotal.labels(labels).inc();
  }
}

// Prometheus メトリクス定義
import { register, Histogram, Counter } from 'prom-client';

/** Latency of GraphQL operations in seconds, labelled by operation name. */
export const graphqlDurationHistogram = new Histogram({
  buckets: [0.1, 0.5, 1, 2, 5, 10],
  labelNames: ['operation'],
  help: 'GraphQL query duration',
  name: 'vendure_graphql_duration_seconds',
});

/** Latency of HTTP API requests in seconds, labelled by endpoint, method and status. */
export const apiDurationHistogram = new Histogram({
  buckets: [0.1, 0.5, 1, 2, 5, 10],
  labelNames: ['endpoint', 'method', 'status'],
  help: 'API endpoint duration',
  name: 'vendure_api_duration_seconds',
});

/** Count of HTTP API requests, labelled by endpoint, method and status. */
export const apiRequestsTotal = new Counter({
  labelNames: ['endpoint', 'method', 'status'],
  help: 'Total API requests',
  name: 'vendure_api_requests_total',
});

// Register the three metrics with the prom-client registry, in declaration order.
[graphqlDurationHistogram, apiDurationHistogram, apiRequestsTotal].forEach(
  metric => {
    register.registerMetric(metric);
  },
);

ビジネスメトリクス

// business-metrics.ts
/**
 * Records business-level Prometheus metrics: orders, product views,
 * B2B rebates and campaign usage.
 */
export class BusinessMetrics {
  /**
   * Order metrics: observes the order total and counts the order by
   * customer type and channel.
   *
   * @param order - The order to record. `order.customer` may be absent
   *   (e.g. guest checkout); such orders are counted as `general`.
   */
  trackOrder(order: Order) {
    orderTotal.labels({ status: order.state }).observe(order.total);

    ordersCreatedTotal
      .labels({
        customerType: this.getCustomerType(order.customer),
        // `channels` may not be loaded on the entity; fall back to 'default'.
        channel: order.channels?.[0]?.code || 'default',
      })
      .inc();
  }

  /**
   * Product metrics: counts a product view and, when the viewer is known,
   * a `product_view` customer activity.
   *
   * @param productId - Identifier of the viewed product.
   * @param customerId - Identifier of the viewing customer, if any.
   */
  trackProductView(productId: string, customerId?: string) {
    productViewsTotal.labels({ productId }).inc();

    if (customerId) {
      customerActivityTotal
        .labels({ customerId, action: 'product_view' })
        .inc();
    }
  }

  /**
   * B2B-specific metrics: stores the latest rebate amount per customer and
   * period, and counts the calculation.
   *
   * @param customerId - Customer the rebate belongs to.
   * @param amount - Calculated rebate amount.
   * @param period - Rebate period identifier.
   */
  trackRebateCalculation(customerId: string, amount: number, period: string) {
    rebateAmountGauge.labels({ customerId, period }).set(amount);

    rebateCalculationsTotal.labels({ period }).inc();
  }

  /**
   * Campaign effectiveness: counts a campaign use and observes the
   * discount that was granted.
   *
   * @param campaignId - Identifier of the campaign.
   * @param discount - Discount amount applied by the campaign.
   */
  trackCampaignUsage(campaignId: string, discount: number) {
    campaignUsageTotal.labels({ campaignId }).inc();

    campaignDiscountTotal.labels({ campaignId }).observe(discount);
  }

  /**
   * Derives the B2B customer type from the `customerStatus` custom field.
   *
   * @param customer - The customer, or `undefined` when the order has none.
   * @returns The custom field value, or `'general'` when the customer, its
   *   custom fields or the status value is missing or empty.
   */
  private getCustomerType(customer?: Customer): string {
    const customFields = customer?.customFields as
      | { customerStatus?: string }
      | undefined;
    return customFields?.customerStatus || 'general';
  }
}

// ビジネスメトリクス定義
/** Distribution of order totals, labelled by order state. */
export const orderTotal = new Histogram({
  name: 'vendure_order_total_amount',
  help: 'Order total amount',
  labelNames: ['status'],
  buckets: [1000, 5000, 10000, 50000, 100000, 500000],
});

/** Count of created orders, labelled by customer type and channel. */
export const ordersCreatedTotal = new Counter({
  name: 'vendure_orders_created_total',
  help: 'Total orders created',
  labelNames: ['customerType', 'channel'],
});

/** Count of product views, labelled by product. */
export const productViewsTotal = new Counter({
  name: 'vendure_product_views_total',
  help: 'Total product views',
  labelNames: ['productId'],
});

/** Count of customer activities, labelled by customer and action. */
export const customerActivityTotal = new Counter({
  name: 'vendure_customer_activity_total',
  help: 'Total customer activities',
  labelNames: ['customerId', 'action'],
});

/** Latest rebate amount per customer and period. */
export const rebateAmountGauge = new Gauge({
  name: 'vendure_rebate_amount',
  help: 'Customer rebate amount',
  labelNames: ['customerId', 'period'],
});

/** Count of rebate calculations, labelled by period. */
export const rebateCalculationsTotal = new Counter({
  name: 'vendure_rebate_calculations_total',
  help: 'Total rebate calculations',
  labelNames: ['period'],
});

/** Count of campaign uses, labelled by campaign. */
export const campaignUsageTotal = new Counter({
  name: 'vendure_campaign_usage_total',
  help: 'Total campaign usages',
  labelNames: ['campaignId'],
});

/**
 * Distribution of discounts granted by campaigns, labelled by campaign.
 * NOTE(review): bucket boundaries are a starting point only — tune them to
 * the real discount range.
 */
export const campaignDiscountTotal = new Histogram({
  name: 'vendure_campaign_discount_amount',
  help: 'Campaign discount amount',
  labelNames: ['campaignId'],
  buckets: [100, 500, 1000, 5000, 10000, 50000],
});

1.2 エラー監視

Sentry 統合

// error-monitoring.ts
import * as Sentry from '@sentry/node';
import { Logger } from '@vendure/core';

/**
 * Thin static wrapper around the Sentry SDK: initialisation, exception and
 * message capture, and redaction of sensitive request data before sending.
 */
export class ErrorMonitor {
  /** Keys whose values are replaced with `[REDACTED]` before an event is sent. */
  private static readonly SENSITIVE_FIELDS = [
    'password',
    'token',
    'apiKey',
    'secret',
  ];

  /**
   * Initialises Sentry from `SENTRY_DSN` and `NODE_ENV`.
   * Traces are sampled at 10% in production and 100% elsewhere.
   */
  static init() {
    Sentry.init({
      dsn: process.env.SENTRY_DSN,
      environment: process.env.NODE_ENV,
      tracesSampleRate: process.env.NODE_ENV === 'production' ? 0.1 : 1.0,

      // Arrow function + explicit class reference: inside a method-shorthand
      // `beforeSend(event) {}` `this` would be the options object, which has
      // no `sanitizeData`, so the hook would throw on every event with data.
      beforeSend: event => {
        // Filter sensitive data
        if (event.request?.data) {
          event.request.data = ErrorMonitor.sanitizeData(event.request.data);
        }
        return event;
      },
    });
  }

  /**
   * Reports an exception to Sentry.
   *
   * @param error - The error to report.
   * @param context - Optional extra data, attached under the `additional` context.
   */
  static captureException(error: Error, context?: any) {
    Sentry.withScope(scope => {
      if (context) {
        scope.setContext('additional', context);
      }
      Sentry.captureException(error);
    });
  }

  /**
   * Reports a plain message to Sentry.
   *
   * @param message - Text to report.
   * @param level - Sentry severity level; defaults to `'info'`.
   */
  static captureMessage(message: string, level: Sentry.SeverityLevel = 'info') {
    Sentry.captureMessage(message, level);
  }

  /**
   * Returns a copy of `data` with sensitive values (passwords, tokens, ...)
   * replaced by `[REDACTED]`. Nested objects and arrays are sanitised too;
   * the input is never mutated. Non-object values are returned unchanged.
   *
   * @param data - Arbitrary request payload.
   * @param ancestors - Objects currently being visited, used to cut cycles.
   * @returns The sanitised copy.
   */
  private static sanitizeData(
    data: any,
    ancestors: WeakSet<object> = new WeakSet(),
  ): any {
    if (typeof data !== 'object' || data === null) {
      return data;
    }

    // A cyclic reference would otherwise recurse forever.
    if (ancestors.has(data)) {
      return '[Circular]';
    }
    ancestors.add(data);

    let result: unknown;
    if (Array.isArray(data)) {
      result = data.map(item => ErrorMonitor.sanitizeData(item, ancestors));
    } else {
      const sanitized: Record<string, unknown> = {};
      for (const [key, value] of Object.entries(data)) {
        sanitized[key] =
          ErrorMonitor.SENSITIVE_FIELDS.includes(key) && value
            ? '[REDACTED]'
            : ErrorMonitor.sanitizeData(value, ancestors);
      }
      result = sanitized;
    }

    // Only ancestors count as cycles; the same object may legitimately
    // appear twice in sibling branches.
    ancestors.delete(data);
    return result;
  }
}

// GraphQL エラーハンドリング
/**
 * Logs a GraphQL error and forwards it to Sentry via `ErrorMonitor`.
 *
 * Uses the static `Logger.error(message, context, trace)` API of
 * `@vendure/core`; the details are serialised into the message because that
 * API takes strings rather than a metadata object.
 *
 * @param error - The GraphQL error; `message`, `path`, `stack` and
 *   `source.body` are read from it when present.
 */
export const graphQLErrorHandler = (error: any) => {
  const operation = error.source?.body;
  const path = error.path;

  const details = JSON.stringify({
    message: error.message,
    path,
    operation,
  });
  Logger.error(`GraphQL Error: ${details}`, 'GraphQLError', error.stack);

  ErrorMonitor.captureException(error, {
    graphql: {
      operation,
      path,
    },
  });
};

2. インフラストラクチャ監視

2.1 Fly.io メトリクス

システムリソース監視

// system-metrics.ts
import { register, Gauge } from 'prom-client';
import { execSync } from 'child_process';

/**
 * Publishes host-level CPU, memory and disk gauges by shelling out to
 * `top`, `free` and `df` every 30 seconds.
 *
 * Readings that cannot be obtained or parsed fall back to 0 (CPU, memory)
 * or are skipped (disk), so a gauge is never set to NaN.
 */
export class SystemMetrics {
  private cpuUsageGauge = new Gauge({
    name: 'system_cpu_usage_percent',
    help: 'CPU usage percentage',
  });

  private memoryUsageGauge = new Gauge({
    name: 'system_memory_usage_bytes',
    help: 'Memory usage in bytes',
  });

  private diskUsageGauge = new Gauge({
    name: 'system_disk_usage_bytes',
    help: 'Disk usage in bytes',
    labelNames: ['mount'],
  });

  /** Registers the gauges and starts the 30-second refresh timer. */
  constructor() {
    register.registerMetric(this.cpuUsageGauge);
    register.registerMetric(this.memoryUsageGauge);
    register.registerMetric(this.diskUsageGauge);

    // Refresh metrics every 30 seconds
    setInterval(() => this.updateMetrics(), 30000);
  }

  /** Collects all readings once and writes them to the gauges. */
  private updateMetrics() {
    try {
      // CPU usage
      this.cpuUsageGauge.set(this.getCPUUsage());

      // Memory usage
      this.memoryUsageGauge.set(this.getMemoryUsage());

      // Disk usage, one series per mount point
      for (const [mount, usage] of Object.entries(this.getDiskUsage())) {
        this.diskUsageGauge.labels(mount).set(usage);
      }
    } catch (error) {
      console.error('Failed to update system metrics:', error);
    }
  }

  /**
   * Reads the user CPU percentage from `top`.
   *
   * @returns The parsed percentage, or 0 when the command fails or its
   *   output is not a number (the pipeline can exit 0 with empty output).
   */
  private getCPUUsage(): number {
    try {
      const output = execSync(
        "top -bn1 | grep 'Cpu(s)' | awk '{print $2}' | cut -d'%' -f1",
      );
      const value = parseFloat(output.toString().trim());
      return Number.isFinite(value) ? value : 0;
    } catch {
      return 0;
    }
  }

  /**
   * Reads used memory in bytes from `free -b`.
   *
   * @returns The parsed byte count, or 0 when the command fails or its
   *   output is not a number.
   */
  private getMemoryUsage(): number {
    try {
      const output = execSync("free -b | grep '^Mem:' | awk '{print $3}'");
      const value = parseInt(output.toString().trim(), 10);
      return Number.isFinite(value) ? value : 0;
    } catch {
      return 0;
    }
  }

  /**
   * Reads used bytes per mount point from `df -B1`.
   *
   * @returns A map of mount point to used bytes; empty when the command
   *   fails. Rows whose "used" column is not a number are skipped.
   */
  private getDiskUsage(): Record<string, number> {
    try {
      const output = execSync('df -B1 | tail -n +2');
      const usage: Record<string, number> = {};

      for (const line of output.toString().trim().split('\n')) {
        const parts = line.split(/\s+/);
        if (parts.length < 6) {
          continue;
        }

        const used = parseInt(parts[2], 10);
        if (!Number.isFinite(used)) {
          continue;
        }

        // The mount point is the last column and may itself contain spaces.
        usage[parts.slice(5).join(' ')] = used;
      }

      return usage;
    } catch {
      return {};
    }
  }
}

2.2 データベース監視

PostgreSQL 監視

// database-metrics.ts
/**
 * Publishes PostgreSQL metrics: connection-pool counters, active connection
 * count (polled every 30 seconds) and a query-duration histogram.
 */
export class DatabaseMetrics {
  private connectionPoolGauge = new Gauge({
    name: 'postgres_connection_pool_size',
    help: 'PostgreSQL connection pool size',
    labelNames: ['state'],
  });

  private queryDurationHistogram = new Histogram({
    name: 'postgres_query_duration_seconds',
    help: 'PostgreSQL query duration',
    labelNames: ['operation'],
    buckets: [0.01, 0.05, 0.1, 0.5, 1, 2, 5],
  });

  private activeConnectionsGauge = new Gauge({
    name: 'postgres_active_connections',
    help: 'Number of active PostgreSQL connections',
  });

  /**
   * @param dataSource - Data source whose driver pool and
   *   `pg_stat_activity` view are inspected.
   */
  constructor(private dataSource: DataSource) {
    const ownMetrics = [
      this.connectionPoolGauge,
      this.queryDurationHistogram,
      this.activeConnectionsGauge,
    ];
    ownMetrics.forEach(metric => register.registerMetric(metric));

    // Poll the connection pool on a fixed interval
    const refreshEveryMs = 30000;
    setInterval(() => this.updateConnectionMetrics(), refreshEveryMs);
  }

  /**
   * Observes one query duration.
   *
   * @param operation - Value for the `operation` label.
   * @param duration - Elapsed time in milliseconds (stored as seconds).
   */
  trackQuery(operation: string, duration: number) {
    const seconds = duration / 1000;
    this.queryDurationHistogram.labels({ operation }).observe(seconds);
  }

  /** Refreshes the pool gauges and the active-connection gauge; errors are logged. */
  private async updateConnectionMetrics() {
    try {
      if (!this.dataSource.isInitialized) {
        return;
      }

      const pool = (this.dataSource.driver as any).master;
      if (pool) {
        // label value -> pool property holding the count
        const poolStates = [
          ['total', 'totalCount'],
          ['idle', 'idleCount'],
          ['waiting', 'waitingCount'],
        ] as const;

        for (const [state, property] of poolStates) {
          this.connectionPoolGauge.labels(state).set(pool[property] || 0);
        }
      }

      // Fetch the number of active connections
      const rows = await this.dataSource.query(
        'SELECT count(*) as active_connections FROM pg_stat_activity WHERE state = $1',
        ['active'],
      );
      const activeCount = parseInt(rows[0]?.active_connections || '0');
      this.activeConnectionsGauge.set(activeCount);
    } catch (error) {
      console.error('Failed to update database metrics:', error);
    }
  }
}

Redis 監視

// redis-metrics.ts
/**
 * Publishes Redis metrics parsed from the `INFO` command: connected clients,
 * used memory and key count per database. Refreshed every 30 seconds.
 */
export class RedisMetrics {
  private connectionGauge = new Gauge({
    name: 'redis_connected_clients',
    help: 'Number of connected Redis clients',
  });

  private memoryUsageGauge = new Gauge({
    name: 'redis_memory_usage_bytes',
    help: 'Redis memory usage in bytes',
  });

  private commandsProcessedTotal = new Counter({
    name: 'redis_commands_processed_total',
    help: 'Total number of commands processed',
    labelNames: ['command'],
  });

  private keyspaceGauge = new Gauge({
    name: 'redis_keyspace_keys',
    help: 'Number of keys in Redis keyspace',
    labelNames: ['db'],
  });

  /**
   * @param redis - Client used to run `INFO`.
   */
  constructor(private redis: Redis) {
    register.registerMetric(this.connectionGauge);
    register.registerMetric(this.memoryUsageGauge);
    register.registerMetric(this.commandsProcessedTotal);
    register.registerMetric(this.keyspaceGauge);

    // Poll Redis INFO
    setInterval(() => this.updateRedisMetrics(), 30000);
  }

  /** Runs `INFO`, parses it and updates the gauges; errors are logged. */
  private async updateRedisMetrics() {
    try {
      const info = await this.redis.info();
      const sections = this.parseRedisInfo(info);

      // Connected clients
      if (sections.clients?.connected_clients) {
        this.connectionGauge.set(
          parseInt(sections.clients.connected_clients, 10),
        );
      }

      // Memory usage
      if (sections.memory?.used_memory) {
        this.memoryUsageGauge.set(parseInt(sections.memory.used_memory, 10));
      }

      // Keyspace: values look like "keys=3,expires=0,avg_ttl=0"
      for (const [db, stats] of Object.entries(sections.keyspace || {})) {
        const match = stats.match(/keys=(\d+)/);
        if (match) {
          this.keyspaceGauge.labels(db).set(parseInt(match[1], 10));
        }
      }
    } catch (error) {
      console.error('Failed to update Redis metrics:', error);
    }
  }

  /**
   * Parses the text returned by `INFO` into `section -> key -> value`.
   *
   * Lines starting with `#` open a new section (name lower-cased). Other
   * lines are split on the FIRST colon only, so values that contain colons
   * themselves (e.g. IPv6 addresses) are kept intact. Lines that appear
   * before any section header are ignored.
   *
   * @param info - Raw `INFO` output.
   * @returns The parsed sections.
   */
  private parseRedisInfo(info: string): Record<string, Record<string, string>> {
    const sections: Record<string, Record<string, string>> = {};
    let currentSection = '';

    for (const rawLine of info.split('\n')) {
      const line = rawLine.trim(); // also strips the trailing '\r'

      if (line.startsWith('#')) {
        currentSection = line.substring(2).toLowerCase();
        sections[currentSection] = {};
        continue;
      }

      const separator = line.indexOf(':');
      if (separator === -1) {
        continue;
      }

      const bucket = sections[currentSection];
      if (bucket) {
        bucket[line.slice(0, separator)] = line.slice(separator + 1);
      }
    }

    return sections;
  }
}

3. ログ管理

3.1 構造化ログ

// structured-logging.ts
import { Logger as VendureLogger } from '@vendure/core';
import winston from 'winston';

/**
 * Logger that writes every entry as JSON through winston (console plus two
 * files under `/app/logs`) and then delegates to the base class.
 *
 * NOTE(review): the base class is imported as `Logger as VendureLogger`;
 * confirm that it really exposes the instance methods invoked through
 * `super.*` below.
 */
export class StructuredLogger extends VendureLogger {
  private winston: winston.Logger;

  /** Builds the winston logger: JSON format, service metadata, console and file transports. */
  constructor() {
    super();

    const level = process.env.LOG_LEVEL || 'info';

    const format = winston.format.combine(
      winston.format.timestamp(),
      winston.format.errors({ stack: true }),
      winston.format.json(),
    );

    // Attached to every entry
    const defaultMeta = {
      service: 'vendure-api',
      environment: process.env.NODE_ENV,
      instance: process.env.FLY_MACHINE_ID || 'local',
    };

    const transports = [
      new winston.transports.Console(),
      new winston.transports.File({
        filename: '/app/logs/error.log',
        level: 'error',
      }),
      new winston.transports.File({
        filename: '/app/logs/combined.log',
      }),
    ];

    this.winston = winston.createLogger({
      level,
      format,
      defaultMeta,
      transports,
    });
  }

  /** Logs at error level with an optional stack trace and context. */
  error(message: string, trace?: string, context?: string) {
    const meta = { trace, context, level: 'error' };
    this.winston.error(message, meta);
    super.error(message, trace, context);
  }

  /** Logs at warn level. */
  warn(message: string, context?: string) {
    const meta = { context, level: 'warn' };
    this.winston.warn(message, meta);
    super.warn(message, context);
  }

  /** Logs at info level. */
  log(message: string, context?: string) {
    const meta = { context, level: 'info' };
    this.winston.info(message, meta);
    super.log(message, context);
  }

  /** Logs at debug level. */
  debug(message: string, context?: string) {
    const meta = { context, level: 'debug' };
    this.winston.debug(message, meta);
    super.debug(message, context);
  }

  /** Logs at verbose level. */
  verbose(message: string, context?: string) {
    const meta = { context, level: 'verbose' };
    this.winston.verbose(message, meta);
    super.verbose(message, context);
  }

  /**
   * Business event log (winston only, category `business`).
   *
   * @param event - Event name.
   * @param data - Event payload.
   * @param userId - Acting user, if known.
   */
  logBusinessEvent(event: string, data: any, userId?: string) {
    const entry = {
      event,
      data,
      userId,
      timestamp: new Date().toISOString(),
      level: 'info',
      category: 'business',
    };
    this.winston.info('Business Event', entry);
  }

  /**
   * Security event log (winston only, category `security`, warn level).
   *
   * @param event - Event name.
   * @param details - Event details.
   * @param severity - Severity tag; defaults to `'medium'`.
   */
  logSecurityEvent(
    event: string,
    details: any,
    severity: 'low' | 'medium' | 'high' = 'medium',
  ) {
    const entry = {
      event,
      details,
      severity,
      timestamp: new Date().toISOString(),
      level: 'warn',
      category: 'security',
    };
    this.winston.warn('Security Event', entry);
  }

  /**
   * Performance log (winston only, category `performance`).
   *
   * @param operation - Name of the measured operation.
   * @param duration - Measured duration.
   * @param metadata - Optional extra data.
   */
  logPerformance(operation: string, duration: number, metadata?: any) {
    const entry = {
      operation,
      duration,
      metadata,
      timestamp: new Date().toISOString(),
      level: 'info',
      category: 'performance',
    };
    this.winston.info('Performance Log', entry);
  }
}

3.2 ログ集約設定

# fluentd-config.yml (オプション)
<source>
  @type tail
  path /app/logs/*.log
  pos_file /var/log/fluentd-vendure.log.pos
  tag vendure.*
  format json
  time_key timestamp
  time_format %Y-%m-%dT%H:%M:%S.%LZ
</source>

<match vendure.**>
  @type datadog
  api_key "#{ENV['DD_API_KEY']}"
  service vendure
  source nodejs
  sourcecategory vendure
  tags environment:#{ENV['NODE_ENV']},instance:#{ENV['FLY_MACHINE_ID']}
</match>

4. アラート設定

4.1 クリティカルアラート

// alert-rules.ts
/**
 * Alert rule catalogue.
 *
 * Each rule names a PromQL expression (`metric`), the comparison applied to
 * it (`condition`), how long the condition must hold (`duration`), a
 * severity, a human-readable message and the notification channels to use.
 */
export const alertRules = {
  // Application availability
  application_down: {
    metric: 'up',
    condition: '== 0',
    duration: '2m',
    severity: 'critical',
    message: 'Vendure application is down',
    channels: ['slack', 'email', 'pagerduty'],
  },

  // Response time.
  // `vendure_api_duration_seconds` is a histogram, so there is no series
  // with that bare name; alert on the 95th percentile computed from the
  // `_bucket` series instead.
  high_response_time: {
    metric:
      'histogram_quantile(0.95, sum by (le) (rate(vendure_api_duration_seconds_bucket[5m])))',
    condition: '> 2',
    duration: '5m',
    severity: 'warning',
    message: 'API response time is high (>2s)',
    channels: ['slack'],
  },

  // Error rate.
  // The threshold is a ratio (5%), so divide 5xx requests by all requests;
  // a bare rate() would be requests per second. sum() drops the labels so
  // both sides of the division match.
  high_error_rate: {
    metric:
      'sum(rate(vendure_api_requests_total{status=~"5.."}[5m])) / sum(rate(vendure_api_requests_total[5m]))',
    condition: '> 0.05',
    duration: '3m',
    severity: 'critical',
    message: 'High error rate detected (>5%)',
    channels: ['slack', 'email'],
  },

  // Database connections
  database_connection_high: {
    metric: 'postgres_active_connections',
    condition: '> 80',
    duration: '5m',
    severity: 'warning',
    message: 'High database connection count',
    channels: ['slack'],
  },

  // Redis memory usage
  redis_memory_high: {
    metric: 'redis_memory_usage_bytes',
    condition: '> 2.5e9', // 2.5GB
    duration: '5m',
    severity: 'warning',
    message: 'Redis memory usage is high',
    channels: ['slack'],
  },

  // Disk usage.
  // NOTE(review): the metric is an absolute byte count but the threshold
  // (0.85) and the message describe a ratio. A total-capacity metric is
  // needed to express this as a percentage — confirm and adjust.
  disk_usage_high: {
    metric: 'system_disk_usage_bytes',
    condition: '> 0.85',
    duration: '10m',
    severity: 'warning',
    message: 'Disk usage is high (>85%)',
    channels: ['slack'],
  },

  // Business indicators.
  // NOTE(review): this is a per-second rate, not the 10% ratio the message
  // describes, and `vendure_orders_failed_total` is not defined alongside
  // the other business metrics — confirm the metric and the denominator.
  // NOTE(review): make sure a `business_team` notification channel is
  // registered, otherwise that target receives nothing.
  order_failure_rate: {
    metric: 'rate(vendure_orders_failed_total[10m])',
    condition: '> 0.1',
    duration: '5m',
    severity: 'critical',
    message: 'Order failure rate is high (>10%)',
    channels: ['slack', 'email', 'business_team'],
  },
};

4.2 アラート通知設定

// notification-channels.ts
/**
 * Fans alerts out to the configured notification channels
 * (Slack, e-mail and PagerDuty).
 */
export class NotificationManager {
  private channels: Map<string, NotificationChannel> = new Map();

  constructor() {
    this.setupChannels();
  }

  /** Registers the channels, reading their credentials from environment variables. */
  private setupChannels() {
    // Slack notifications
    this.channels.set(
      'slack',
      new SlackChannel({
        webhookUrl: process.env.SLACK_WEBHOOK_URL!,
        channel: '#vendure-alerts',
        username: 'Vendure Monitor',
      }),
    );

    // Email notifications
    this.channels.set(
      'email',
      new EmailChannel({
        smtp: {
          host: process.env.SMTP_HOST!,
          port: 587,
          auth: {
            user: process.env.SMTP_USER!,
            pass: process.env.SMTP_PASSWORD!,
          },
        },
        from: 'alerts@ritsubi.co.jp',
        to: ['admin@ritsubi.co.jp', 'dev@ritsubi.co.jp'],
      }),
    );

    // PagerDuty notifications (for critical alerts)
    this.channels.set(
      'pagerduty',
      new PagerDutyChannel({
        integrationKey: process.env.PAGERDUTY_INTEGRATION_KEY!,
      }),
    );
  }

  /**
   * Sends `alert` to every channel listed in `alert.channels`.
   *
   * Deliveries run concurrently. A failing channel is logged and does not
   * affect the others; a channel name that is not registered is logged as
   * a warning instead of being dropped silently. Never rejects because of
   * a delivery failure.
   *
   * @param alert - The alert to deliver.
   */
  async sendAlert(alert: Alert) {
    const deliveries = alert.channels.map(async channelName => {
      const channel = this.channels.get(channelName);
      if (!channel) {
        console.warn(
          `No notification channel registered for "${channelName}"; alert "${alert.title}" was not delivered there`,
        );
        return;
      }

      try {
        await channel.send(alert);
      } catch (error) {
        console.error(`Failed to send alert to ${channelName}:`, error);
      }
    });

    await Promise.allSettled(deliveries);
  }
}

/** A single alert to be delivered through one or more notification channels. */
interface Alert {
  /** Severity level of the alert. */
  severity: 'info' | 'warning' | 'critical';
  /** Short title of the alert. */
  title: string;
  /** Human-readable description. */
  message: string;
  /** Time the alert refers to. */
  timestamp: Date;
  /** Names of the notification channels that should receive the alert. */
  channels: string[];
  /** Optional free-form extra data. */
  metadata?: any;
}

/** Contract implemented by every alert delivery channel. */
interface NotificationChannel {
  /** Delivers `alert` through this channel. */
  send(alert: Alert): Promise<void>;
}

/** Delivers alerts to Slack through an incoming webhook, as a coloured attachment. */
class SlackChannel implements NotificationChannel {
  /**
   * @param config - Webhook URL plus the channel and user name to post as.
   */
  constructor(
    private config: {
      webhookUrl: string;
      channel: string;
      username: string;
    },
  ) {}

  /**
   * Posts `alert` to the configured webhook.
   *
   * @param alert - The alert to deliver.
   * @throws Error when the webhook answers with a non-2xx status.
   */
  async send(alert: Alert): Promise<void> {
    // Attachment colour per severity: green / orange / red
    const color = {
      info: '#36a64f',
      warning: '#ff9900',
      critical: '#ff0000',
    }[alert.severity];

    const payload = {
      channel: this.config.channel,
      username: this.config.username,
      attachments: [
        {
          color,
          title: alert.title,
          text: alert.message,
          // Slack's attachment timestamp field is `ts` (epoch seconds);
          // a `timestamp` key is not part of the attachment format.
          ts: Math.floor(alert.timestamp.getTime() / 1000),
          fields: [
            {
              title: 'Severity',
              value: alert.severity.toUpperCase(),
              short: true,
            },
            {
              title: 'Environment',
              value: process.env.NODE_ENV || 'unknown',
              short: true,
            },
          ],
        },
      ],
    };

    const response = await fetch(this.config.webhookUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify(payload),
    });

    if (!response.ok) {
      throw new Error(`Slack notification failed: ${response.statusText}`);
    }
  }
}

5. ヘルスチェック

5.1 アプリケーションヘルスチェック

// health-check.ts
import { Controller, Get } from '@nestjs/common';
import {
  HealthCheck,
  HealthCheckService,
  TypeOrmHealthIndicator,
  MemoryHealthIndicator,
  DiskHealthIndicator,
} from '@nestjs/terminus';

/**
 * HTTP health endpoints under `/health`:
 * - `GET /health`           — full check (database, Redis, memory, disk, business logic)
 * - `GET /health/readiness` — database and Redis only
 * - `GET /health/liveness`  — heap memory only
 *
 * NOTE(review): `customBusinessHealthCheck` reads `this.productService` and
 * `this.pricingService`, but neither is among the constructor parameters
 * shown here — confirm they are injected, otherwise the business check
 * always reports `down`.
 */
@Controller('health')
export class HealthController {
  constructor(
    private health: HealthCheckService,
    private db: TypeOrmHealthIndicator,
    private memory: MemoryHealthIndicator,
    private disk: DiskHealthIndicator,
    private redisHealth: RedisHealthIndicator,
  ) {}

  /** Full health check combining every indicator below. */
  @Get()
  @HealthCheck()
  check() {
    return this.health.check([
      // Database connectivity check
      () => this.db.pingCheck('database'),

      // Redis connectivity check
      () => this.redisHealth.pingCheck('redis'),

      // Memory usage check (thresholds passed: 250 MiB heap, 500 MiB RSS)
      () => this.memory.checkHeap('memory_heap', 250 * 1024 * 1024),
      () => this.memory.checkRSS('memory_rss', 500 * 1024 * 1024),

      // Disk usage check on the root path with a 0.85 threshold
      () =>
        this.disk.checkStorage('storage', {
          path: '/',
          thresholdPercent: 0.85,
        }),

      // Custom business-logic check
      () => this.customBusinessHealthCheck(),
    ]);
  }

  /** Readiness check: database and Redis must respond. */
  @Get('readiness')
  @HealthCheck()
  readiness() {
    return this.health.check([
      () => this.db.pingCheck('database'),
      () => this.redisHealth.pingCheck('redis'),
    ]);
  }

  /** Liveness check: heap memory only (threshold passed: 500 MiB). */
  @Get('liveness')
  @HealthCheck()
  liveness() {
    return this.health.check([
      () => this.memory.checkHeap('memory_heap', 500 * 1024 * 1024),
    ]);
  }

  /**
   * Exercises product lookup and price calculation with fixed test IDs.
   *
   * @returns A `business-logic` entry with status `up` when both calls
   *   return a value, or status `down` with the error message when either
   *   call throws or returns nothing. Never throws.
   */
  private async customBusinessHealthCheck() {
    try {
      // Verify that key business functions work
      // e.g. product search, price calculation

      const testProduct = await this.productService.findOne('test-product-id');
      if (!testProduct) {
        throw new Error('Test product not found');
      }

      const testPricing = await this.pricingService.calculatePrice(
        testProduct,
        'test-customer-id',
      );
      if (!testPricing) {
        throw new Error('Pricing calculation failed');
      }

      return {
        'business-logic': {
          status: 'up',
          details: {
            productSearch: 'ok',
            pricingCalculation: 'ok',
          },
        },
      };
    } catch (error) {
      // Failures are reported in the result instead of being rethrown.
      return {
        'business-logic': {
          status: 'down',
          details: {
            error: error.message,
          },
        },
      };
    }
  }
}

/** Health indicator that probes Redis with `PING`. */
class RedisHealthIndicator {
  constructor(private redis: Redis) {}

  /**
   * Pings Redis and reports the outcome under `key`.
   *
   * @param key - Property name used for the result entry.
   * @returns `{ [key]: { status } }` with `up` when the reply is `PONG` and
   *   `down` otherwise; when the ping throws, `down` plus the error message
   *   in `details`. Never throws.
   */
  async pingCheck(key: string) {
    let reply: unknown;

    try {
      reply = await this.redis.ping();
    } catch (error) {
      const details = (error as any).message;
      return { [key]: { status: 'down', details } };
    }

    const status = reply === 'PONG' ? 'up' : 'down';
    return { [key]: { status } };
  }
}

6. ダッシュボード設定

6.1 Grafana ダッシュボード

{
  "dashboard": {
    "title": "Vendure Production Dashboard",
    "panels": [
      {
        "title": "API Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(vendure_api_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          },
          {
            "expr": "histogram_quantile(0.50, rate(vendure_api_duration_seconds_bucket[5m]))",
            "legendFormat": "50th percentile"
          }
        ],
        "yAxes": [
          {
            "label": "Response Time (seconds)",
            "max": 5
          }
        ]
      },
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(vendure_api_requests_total[5m])",
            "legendFormat": "{{endpoint}} - {{method}}"
          }
        ]
      },
      {
        "title": "Error Rate",
        "type": "singlestat",
        "targets": [
          {
            "expr": "sum(rate(vendure_api_requests_total{status=~\"5..\"}[5m])) / sum(rate(vendure_api_requests_total[5m])) * 100"
          }
        ],
        "valueName": "current",
        "format": "percent",
        "thresholds": "1,5",
        "colorBackground": true
      },
      {
        "title": "Database Connections",
        "type": "graph",
        "targets": [
          {
            "expr": "postgres_active_connections",
            "legendFormat": "Active Connections"
          },
          {
            "expr": "postgres_connection_pool_size{state=\"idle\"}",
            "legendFormat": "Idle Pool"
          }
        ]
      },
      {
        "title": "Redis Memory Usage",
        "type": "graph",
        "targets": [
          {
            "expr": "redis_memory_usage_bytes / 1024 / 1024",
            "legendFormat": "Memory Usage (MB)"
          }
        ]
      },
      {
        "title": "Business Metrics",
        "type": "row",
        "panels": [
          {
            "title": "Orders per Hour",
            "type": "graph",
            "targets": [
              {
                "expr": "rate(vendure_orders_created_total[1h]) * 3600",
                "legendFormat": "{{customerType}}"
              }
            ]
          },
          {
            "title": "Revenue per Hour",
            "type": "graph",
            "targets": [
              {
                "expr": "rate(vendure_order_total_amount_sum[1h]) * 3600",
                "legendFormat": "Revenue (JPY/hour)"
              }
            ]
          }
        ]
      }
    ],
    "time": {
      "from": "now-6h",
      "to": "now"
    },
    "refresh": "30s"
  }
}

7. 運用プロセス

7.1 インシデント対応

// incident-response.ts
/**
 * Turns alerts into incidents: classifies severity, creates the incident,
 * notifies on-call and runs the matching runbook.
 */
export class IncidentResponse {
  /** Response and resolution targets per priority, in minutes. */
  private static readonly SEVERITY_LEVELS = {
    // respond within 15 minutes, resolve within 4 hours
    P1: { name: 'Critical', responseTime: 15, resolveTime: 60 * 4 },
    // respond within 1 hour, resolve within 24 hours
    P2: { name: 'High', responseTime: 60, resolveTime: 60 * 24 },
    // respond within 4 hours, resolve within 72 hours
    P3: { name: 'Medium', responseTime: 60 * 4, resolveTime: 60 * 72 },
    // respond within 24 hours, resolve within 1 week
    P4: { name: 'Low', responseTime: 60 * 24, resolveTime: 60 * 168 },
  };

  /**
   * Handles one alert end to end.
   *
   * @param alert - The triggering alert.
   * @returns The created incident, after on-call has been notified and the
   *   runbook (if any) has run.
   */
  static async handleIncident(alert: Alert) {
    const incident = await this.createIncident(
      alert,
      this.determineSeverity(alert),
    );

    await this.notifyOnCall(incident);
    await this.executeRunbook(incident);

    return incident;
  }

  /**
   * Maps an alert to a priority.
   *
   * @param alert - The alert to classify.
   * @returns `'P1'` when the title contains a critical pattern or the
   *   severity is `critical`, `'P2'` for `warning`, otherwise `'P3'`.
   */
  private static determineSeverity(alert: Alert): string {
    // Titles containing any of these are always top priority
    const alwaysCritical = [
      'application_down',
      'database_connection_failed',
      'order_failure_rate',
    ];

    for (const pattern of alwaysCritical) {
      if (alert.title.includes(pattern)) {
        return 'P1';
      }
    }

    switch (alert.severity) {
      case 'critical':
        return 'P1';
      case 'warning':
        return 'P2';
      default:
        return 'P3';
    }
  }

  /** Runs the runbook registered for the incident type, if one exists. */
  private static async executeRunbook(incident: Incident) {
    const runbook = this.getRunbook(incident.type);
    if (!runbook) {
      return;
    }
    await runbook.execute(incident);
  }
}

// 自動復旧処理
/** Automatic recovery actions keyed by alert title. */
export class AutoRecovery {
  /**
   * Tries the recovery action that matches `alert.title`.
   *
   * @param alert - The triggering alert.
   * @returns `true` when a recovery action ran successfully, `false` when
   *   it failed or no action exists for this alert.
   */
  static async attemptRecovery(alert: Alert): Promise<boolean> {
    switch (alert.title) {
      case 'high_memory_usage':
        return await this.restartApplication();

      case 'redis_connection_failed':
        return await this.reconnectRedis();

      case 'database_connection_high':
        return await this.killIdleConnections();

      default:
        return false;
    }
  }

  /**
   * Restarts the application through `flyctl` and waits up to 60 seconds
   * for the health endpoint to answer.
   *
   * NOTE(review): the command passes no machine ID, and restarting the
   * machine this code runs on would terminate the process before the
   * health wait completes — confirm where this is meant to run.
   *
   * @returns `true` when the restart command succeeded and the service
   *   became healthy, otherwise `false`.
   */
  private static async restartApplication(): Promise<boolean> {
    try {
      // Restart the Fly.io application
      execSync('flyctl machine restart', { timeout: 30000 });

      // Wait for the health check to pass
      await this.waitForHealthy(60000);

      return true;
    } catch (error) {
      console.error('Auto recovery failed:', error);
      return false;
    }
  }

  /**
   * Polls the health endpoint every 5 seconds until it answers with a
   * success status.
   *
   * The URL is taken from `HEALTH_CHECK_URL` and defaults to
   * `http://localhost:3000/health`. It must be absolute: server-side
   * `fetch` rejects a relative URL such as `/health`, which previously made
   * every poll fail silently and the wait always time out.
   * NOTE(review): the default host and port are an assumption — confirm
   * against the deployment.
   *
   * @param timeout - Maximum time to wait, in milliseconds.
   * @throws Error `'Health check timeout'` when the endpoint did not become
   *   healthy in time.
   */
  private static async waitForHealthy(timeout: number): Promise<void> {
    const healthUrl =
      process.env.HEALTH_CHECK_URL ?? 'http://localhost:3000/health';
    const start = Date.now();

    while (Date.now() - start < timeout) {
      try {
        const response = await fetch(healthUrl);
        if (response.ok) {
          return;
        }
      } catch {
        // Connection errors are expected while the service restarts; keep polling
      }

      await new Promise(resolve => setTimeout(resolve, 5000));
    }

    throw new Error('Health check timeout');
  }
}

7.2 定期メンテナンス

#!/bin/bash
# maintenance.sh - 定期メンテナンススクリプト

# Database maintenance.
# Pipes a fixed SQL script into `flyctl postgres connect` for the
# ritsubi-vendure-db app: VACUUM ANALYZE, REINDEX of the ritsubi_vendure
# database, then deletion of old session and job-record rows.
# NOTE(review): confirm the table and column names (vendure_session.expires_at,
# vendure_job_record.finished_at) against the actual schema.
postgres_maintenance() {
  echo "Running PostgreSQL maintenance..."

  # Everything up to EOF is sent to the database session as-is.
  flyctl postgres connect -a ritsubi-vendure-db << EOF
    VACUUM ANALYZE;
    REINDEX DATABASE ritsubi_vendure;

    -- 古いログの削除
    DELETE FROM vendure_session WHERE expires_at < NOW() - INTERVAL '7 days';
    DELETE FROM vendure_job_record WHERE finished_at < NOW() - INTERVAL '30 days';
EOF
}

# Redis maintenance.
# Deletes keys matching "expired:*" and asks Redis to release unused memory.
redis_maintenance() {
  echo "Running Redis maintenance..."

  # Delete expired keys.
  # -r (GNU/BusyBox xargs): do not run `redis-cli del` at all when the scan
  # returns no keys; without it DEL is invoked with no arguments and fails.
  redis-cli --scan --pattern "expired:*" | xargs -r redis-cli del

  # Memory optimisation
  redis-cli MEMORY PURGE
}

# Log rotation.
# Compresses application logs older than 7 days and removes compressed logs
# older than 30 days.
log_rotation() {
  local log_dir="/app/logs"

  echo "Rotating application logs..."

  # Compress, then purge, old log files
  find "$log_dir" -name "*.log" -mtime +7 -exec gzip {} \;
  find "$log_dir" -name "*.log.gz" -mtime +30 -delete
}

# Metrics cleanup.
# Currently only announces the step.
metrics_cleanup() {
  printf '%s\n' "Cleaning up old metrics..."

  # Removal of old Prometheus metrics
  # (normally configured on the Prometheus server side)
}

# Backup verification.
# Lists the PostgreSQL backups of the database app and prints the time of the
# last successful Redis save.
backup_verification() {
  local db_app="ritsubi-vendure-db"

  echo "Verifying backups..."

  # PostgreSQL backup verification
  flyctl postgres backups list -a "$db_app"

  # Redis AOF/RDB file verification
  redis-cli LASTSAVE
}

# Entry point.
# Runs every maintenance step in order, printing a start and end timestamp.
main() {
  echo "Starting maintenance at $(date)"

  local step
  for step in \
    postgres_maintenance \
    redis_maintenance \
    log_rotation \
    metrics_cleanup \
    backup_verification; do
    "$step"
  done

  echo "Maintenance completed at $(date)"
}

main "$@"

文書バージョン: 1.0 / 作成日: 2025年9月17日 / 定期レビュー: 月次で監視設定とアラート閾値を見直し