智能监控告警与自愈实操

随着前端系统复杂度不断提升,监控与自愈能力已成为保障应用稳定性和用户体验的关键。从错误捕获到性能瓶颈定位,再到自动化修复,构建一套智能的监控告警与自愈体系,能显著提升研发效率和系统韧性。

智能监控体系的构建核心

智能监控的核心在于从“数据采集”到“洞察生成”的全链路自动化。传统监控往往停留在错误日志收集层面,而智能监控则需要整合性能指标、用户行为、业务数据等多维度信息。

一个基础的浏览器端错误监控示例,通常会使用 window.onerror 和 unhandledrejection 事件,分别捕获脚本运行时错误与未处理的 Promise 异常:

示例代码(JavaScript):
// Global handler for uncaught script errors: bundles the error details with
// browser and page context and forwards them to the monitoring service.
window.onerror = (message, source, lineno, colno, error) => {
  sendToMonitoringService('js_error', {
    message,
    source,
    lineno,
    colno,
    stack: error?.stack, // `error` may be absent, so the stack is optional
    userAgent: navigator.userAgent,
    url: window.location.href,
    timestamp: new Date().toISOString()
  });

  // Returning true suppresses the browser's default error reporting.
  return true;
};

// Report promise rejections that no handler picked up.
window.addEventListener('unhandledrejection', (event) => {
  sendToMonitoringService('promise_error', {
    type: 'promise_rejection',
    // The reason can be any value (or missing), so stringify defensively.
    reason: event.reason?.toString(),
    timestamp: new Date().toISOString()
  });
});

// Collect the Largest Contentful Paint (LCP) metric and report it with a
// good / needs-improvement / poor rating (thresholds: 2500 ms and 4000 ms).
if ('PerformanceObserver' in window) {
  const lcpEntryType = 'largest-contentful-paint';
  // Only observe when the browser actually exposes LCP entries.
  const supportedTypes = PerformanceObserver.supportedEntryTypes || [];

  if (supportedTypes.includes(lcpEntryType)) {
    const observer = new PerformanceObserver((list) => {
      for (const entry of list.getEntries()) {
        if (entry.entryType === lcpEntryType) {
          sendToMonitoringService('web_vitals', {
            name: 'LCP',
            value: entry.startTime,
            rating: entry.startTime < 2500 ? 'good' : entry.startTime < 4000 ? 'needs-improvement' : 'poor'
          });
        }
      }
    });
    // `buffered: true` replays LCP candidates recorded before this observer was
    // registered; the `entryTypes` form cannot request buffered entries, so
    // early paints would otherwise be missed.
    observer.observe({ type: lcpEntryType, buffered: true });
  }
}

然而,智能监控远不止于此。它需要建立用户会话(Session)级别的追踪,将分散的错误、性能指标和用户操作串联起来,还原问题发生的完整上下文。

告警策略的智能化设计

告警的智能化体现在从“基于阈值”到“基于异常检测”和“关联分析”的演进。简单的阈值告警(如错误数超过100次/分钟)容易产生噪音,智能告警则能识别异常模式。

例如,我们可以实现一个简单的基线异常检测算法,用于判断API响应时间是否异常:

示例代码(JavaScript):
/**
 * Baseline-driven anomaly detector for API response times.
 *
 * For every endpoint it keeps a sliding window of the most recent 1000
 * response times together with their mean and (population) standard
 * deviation, and raises an alert when a new sample exceeds mean + 3 * stdDev.
 */
class SmartAlertSystem {
  constructor() {
    this.apiPerformanceBaseline = new Map(); // endpoint -> { values, timestamps, mean, stdDev }
    this.anomalyHistory = new Map(); // endpoint -> anomalies that produced an alert
  }

  /**
   * Record one response-time sample for an endpoint and check it for anomalies.
   *
   * @param {string} apiEndpoint - Key identifying the API.
   * @param {number} responseTime - Measured response time; non-finite values are ignored.
   * @param {boolean} success - Accepted for interface compatibility; currently unused.
   */
  recordApiMetric(apiEndpoint, responseTime, success) {
    // A NaN/Infinity sample would corrupt the mean and stdDev for the whole window.
    if (!Number.isFinite(responseTime)) {
      return;
    }

    const now = Date.now();
    const history = this.apiPerformanceBaseline.get(apiEndpoint) || {
      values: [],
      timestamps: [],
      mean: 0,
      stdDev: 0
    };

    // Compare the new sample against the baseline built from PREVIOUS samples
    // only. Adding it first would let an outlier inflate its own threshold.
    // Detection starts once 30 earlier samples exist.
    if (history.values.length >= 30) {
      this.detectAnomaly(apiEndpoint, responseTime, history);
    }

    // Keep only the most recent 1000 data points.
    history.values.push(responseTime);
    history.timestamps.push(now);
    if (history.values.length > 1000) {
      history.values.shift();
      history.timestamps.shift();
    }

    // Refresh mean and standard deviation for the next sample.
    this.calculateStatistics(history);

    this.apiPerformanceBaseline.set(apiEndpoint, history);
  }

  /**
   * Recompute `mean` and `stdDev` (population form, divides by N) in place.
   *
   * @param {{values: number[], mean: number, stdDev: number}} history
   */
  calculateStatistics(history) {
    const values = history.values;
    if (values.length === 0) {
      history.mean = 0;
      history.stdDev = 0;
      return;
    }
    const mean = values.reduce((sum, val) => sum + val, 0) / values.length;
    const variance = values.reduce((sum, val) => sum + Math.pow(val - mean, 2), 0) / values.length;
    history.mean = mean;
    history.stdDev = Math.sqrt(variance);
  }

  /**
   * Apply the 3-sigma rule to `currentValue` and alert when it is exceeded.
   * At most three alerts per endpoint are sent within any five-minute window;
   * anomalies suppressed by that limit are not recorded.
   *
   * @param {string} apiEndpoint
   * @param {number} currentValue
   * @param {{mean: number, stdDev: number}} history - Baseline to compare against.
   */
  detectAnomaly(apiEndpoint, currentValue, history) {
    const threshold = history.mean + 3 * history.stdDev;
    if (currentValue <= threshold) {
      return;
    }

    const anomaly = {
      apiEndpoint,
      currentValue,
      baselineMean: history.mean,
      baselineStdDev: history.stdDev,
      timestamp: new Date().toISOString(),
      // Beyond 5 sigma is treated as high severity.
      severity: currentValue > history.mean + 5 * history.stdDev ? 'high' : 'medium'
    };

    // Rate-limit alerts so a sustained incident does not flood the channel.
    const recentAnomalies = this.anomalyHistory.get(apiEndpoint) || [];
    const recentTimeWindow = Date.now() - 5 * 60 * 1000; // five minutes ago
    const recentCount = recentAnomalies.filter(a =>
      Date.parse(a.timestamp) > recentTimeWindow
    ).length;

    if (recentCount < 3) {
      this.triggerAlert(anomaly);
      recentAnomalies.push(anomaly);
      this.anomalyHistory.set(apiEndpoint, recentAnomalies.slice(-20)); // keep the latest 20
    }
  }

  /**
   * Deliver an alert. This sample only logs a warning; a real integration
   * would forward the anomaly to a chat or e-mail channel.
   *
   * @param {object} anomaly - Anomaly record built by `detectAnomaly`.
   */
  triggerAlert(anomaly) {
    console.warn(`API性能异常告警: ${anomaly.apiEndpoint}`, anomaly);
  }
}

// Usage example: one shared detector instance.
const alertSystem = new SmartAlertSystem();

// Feed a single (successful) API call measurement into the detector.
function monitorApiCall(endpoint, responseTime) {
  alertSystem.recordApiMetric(endpoint, responseTime, true);
}

// Simulated traffic: one sample per second.
setInterval(() => {
  // Normal response time is uniformly distributed in 50-150 ms.
  const baseTime = 50 + Math.random() * 100;

  // With 5% probability add a 500-1500 ms spike to simulate an anomaly.
  let anomaly = 0;
  if (Math.random() < 0.05) {
    anomaly = 500 + Math.random() * 1000;
  }

  monitorApiCall('/api/user/data', baseTime + anomaly);
}, 1000);

更高级的告警策略可以结合机器学习模型,识别更复杂的异常模式,如周期性变化、趋势性上涨等。

自愈机制的实现路径

自愈机制的核心是预设修复策略并在满足条件时自动执行。前端自愈通常围绕资源加载、状态恢复和降级策略展开。

资源加载失败的自愈

对于CSS、JS等静态资源加载失败,可以实现自动重试或切换备用CDN:

示例代码(JavaScript):
/**
 * Self-healing for failed stylesheet/script loads: retries the resource on the
 * next CDN in `cdnFallbacks` up to `maxRetries` times, then applies a
 * degradation strategy (inline base styles or a notice banner).
 */
class ResourceSelfHealing {
  constructor() {
    this.failedResources = new Set(); // URLs that have already been handled
    this.maxRetries = 2;
    this.cdnFallbacks = [
      'https://cdn1.example.com',
      'https://cdn2.example.com',
      'https://cdn3.example.com'
    ];
  }

  /** Start listening for resource load errors. */
  init() {
    // Resource load errors do not bubble, so the listener must run in the
    // capture phase to see them on window.
    window.addEventListener('error', (event) => {
      const target = event.target;
      const tagName = target?.tagName; // undefined when the target is window
      if (tagName === 'LINK' || tagName === 'SCRIPT') {
        this.handleResourceError(target);
      }
    }, true);
  }

  /**
   * Decide between retrying a failed resource and degrading.
   *
   * @param {Element} element - The <link> or <script> element that failed.
   */
  handleResourceError(element) {
    const src = element.href || element.src;
    if (!src || this.failedResources.has(src)) return;

    this.failedResources.add(src);
    console.log(`检测到资源加载失败: ${src}`);

    // The retry counter travels with the element via data-retry-count.
    const retryCount = Number.parseInt(element.dataset.retryCount || '0', 10) || 0;
    if (retryCount < this.maxRetries) {
      this.retryWithFallback(element, src, retryCount);
    } else {
      this.applyDegradationStrategy(element);
    }
  }

  /**
   * Reload the resource from the next CDN.
   *
   * @param {Element} element - Element whose resource failed.
   * @param {string} originalUrl - URL that failed to load.
   * @param {number} retryCount - Retries already performed for this resource.
   */
  retryWithFallback(element, originalUrl, retryCount) {
    const nextRetry = retryCount + 1;
    const fallbackUrl = this.buildFallbackUrl(originalUrl, nextRetry);

    console.log(`尝试备用CDN: ${fallbackUrl}`);

    if (element.tagName === 'SCRIPT') {
      // Changing `src` on an already-processed script does not reload it, so a
      // fresh element is inserted. Copy the attributes so the retry counter,
      // `data-critical`, `crossorigin`, etc. survive the replacement.
      const newScript = document.createElement('script');
      for (const { name, value } of Array.from(element.attributes)) {
        if (name !== 'src') {
          newScript.setAttribute(name, value);
        }
      }
      newScript.dataset.retryCount = String(nextRetry);
      newScript.src = fallbackUrl;
      document.head.appendChild(newScript);
      element.remove();
      return;
    }

    // Record the attempt before touching the URL so the counter is in place
    // when the next error event fires.
    element.dataset.retryCount = String(nextRetry);
    if (element.tagName === 'LINK') {
      element.href = fallbackUrl;
    } else {
      element.src = fallbackUrl;
    }
  }

  /**
   * Build the retry URL: swap the CDN prefix for the next entry in
   * `cdnFallbacks` and set (not append) a `retry` query parameter.
   *
   * @param {string} originalUrl - URL that failed.
   * @param {number} retryNumber - 1-based attempt number for the `retry` parameter.
   * @returns {string} URL to try next.
   */
  buildFallbackUrl(originalUrl, retryNumber) {
    const currentCdn = this.getCurrentCdn(originalUrl);
    let candidate = originalUrl;

    // Only swap the prefix when the URL really lives on a known CDN;
    // otherwise the same URL is retried.
    if (typeof currentCdn === 'string' && originalUrl.startsWith(currentCdn)) {
      const nextCdnIndex = (this.cdnFallbacks.indexOf(currentCdn) + 1) % this.cdnFallbacks.length;
      candidate = this.cdnFallbacks[nextCdnIndex] + originalUrl.slice(currentCdn.length);
    }

    try {
      const url = new URL(candidate);
      // `set` replaces a previous retry marker and respects existing query strings.
      url.searchParams.set('retry', String(retryNumber));
      return url.href;
    } catch {
      // Not an absolute URL: fall back to a plain, separator-aware append.
      const separator = candidate.includes('?') ? '&' : '?';
      return `${candidate}${separator}retry=${retryNumber}`;
    }
  }

  /**
   * Find which configured CDN prefix a URL starts with.
   *
   * @param {string} url
   * @returns {string} The matching prefix, or the first configured CDN if none match.
   */
  getCurrentCdn(url) {
    for (const cdn of this.cdnFallbacks) {
      if (url.startsWith(cdn)) {
        return cdn;
      }
    }
    return this.cdnFallbacks[0];
  }

  /**
   * Last resort once retries are exhausted: inline base styles for a failed
   * stylesheet, or a notice banner for a failed script marked data-critical="true".
   *
   * @param {Element} element - Element whose resource could not be recovered.
   */
  applyDegradationStrategy(element) {
    const tagName = element.tagName;
    console.log(`对${tagName}资源应用降级策略`);

    if (tagName === 'LINK' && element.rel === 'stylesheet') {
      this.applyBasicStyles();
    } else if (tagName === 'SCRIPT') {
      if (element.dataset.critical === 'true') {
        this.showDegradationUI();
      }
    }
  }

  /** Inject a minimal inline stylesheet so the page stays readable. */
  applyBasicStyles() {
    const basicCSS = `
      body { font-family: system-ui, sans-serif; line-height: 1.5; }
      .container { max-width: 1200px; margin: 0 auto; padding: 20px; }
      /* 更多基础样式... */
    `;
    const style = document.createElement('style');
    style.textContent = basicCSS;
    document.head.appendChild(style);
  }

  /** Show a dismissible banner telling the user some features are unavailable. */
  showDegradationUI() {
    const banner = document.createElement('div');
    banner.style.cssText = `
      position: fixed;
      top: 0;
      left: 0;
      right: 0;
      background: #fff3cd;
      border-bottom: 1px solid #ffeaa7;
      padding: 10px 20px;
      text-align: center;
      z-index: 9999;
    `;
    // Static markup only; no external data is interpolated here.
    banner.innerHTML = `
      <p>部分功能暂时不可用,我们正在全力修复。基础功能仍可正常使用。</p>
      <button onclick="this.parentElement.style.display='none'">知道了</button>
    `;
    // <body> may not exist yet if a script in <head> failed early.
    (document.body || document.documentElement).prepend(banner);
  }
}

// Initialize the self-healing system: create the instance and register its
// capture-phase resource error listener via init().
const selfHealing = new ResourceSelfHealing();
selfHealing.init();

应用状态异常的自愈

对于单页应用(SPA)的状态异常,可以实现状态自动恢复机制:

示例代码(JavaScript):
class StateRecoverySystem {
  constructor() {
    this.stateCheckpoints = [];
    this.maxCheckpoints = 10;
    this.healthCheckInterval = 30000; // 30秒检查一次
    this.init();
  }

  init() {
    // 定期保存状态检查点
    setInterval(() => this.saveCheckpoint(), this.healthCheckInterval);
    
    // 监听应用异常
    this.setupErrorBoundary();
    
    // 页面可见性变化时检查状态
    document.addEventListener('visibilitychange', () => {
      if (!document.hidden) {
        this.verifyStateConsistency();
      }
    });
  }

  saveCheckpoint() {
    const checkpoint = {
      timestamp: Date.now(),
      route: window.location.pathname,
      appState: this.captureAppState(),
      domState: this.captureDomState(),
      performance: this.capturePerformance()
    };

    this.stateCheckpoints.push(checkpoint);
    if (this.stateCheckpoints.length > this.maxCheckpoints) {
      this.stateCheckpoints.shift();
    }

    localStorage.setItem('last_state_checkpoint', JSON.stringify(checkpoint));
  }

  captureAppState() {
    // 根据实际应用状态结构捕获关键状态
    // 例如从状态管理库(Vuex/Pinia, Vue的响应式状态等)中获取
    const state = {
      user: window.__APP_STATE__?.user || null,
      theme: document.documentElement.getAttribute('data-theme') || 'light',
      // 其他关键状态...
    };
    return state;
  }

  captureDomState() {
    // 捕获关键DOM元素的状态
    const criticalElements = Array.from(document.querySelectorAll('[data-critical-state]'));
    return criticalElements.map(el => ({
      id: el.id,
      text: el.textContent?.slice(0, 100),
      value: el.value,
      checked: el.checked
    }));
  }

  capturePerformance() {
    const perf = performance.getEntriesByType('navigation')[0];
    return {
      loadTime: perf?.loadEventEnd || 0,
      domComplete: perf?.domComplete || 0
    };
  }

  setupErrorBoundary() {
    // 全局错误边界
    const originalAddEventListener = window.addEventListener;
    window.addEventListener = function(type, listener, options) {
      const wrappedListener = function(...args) {
        try {
          return listener.apply(this, args);
        } catch (error) {
          console.error('事件处理错误:', error);
          self.recoverFromError(error, { type, args });
          throw error;
        }
      };
      return originalAddEventListener.call(this, type, wrappedListener, options);
    };

    // 重写Promise.then以捕获异步错误
    const originalThen = Promise.prototype.then;
    Promise.prototype.then = function(onFulfilled, onRejected) {
      const wrappedOnFulfilled = onFulfilled && function(value) {
        try {
          return onFulfilled.call(this, value);
        } catch (error) {
          self.recoverFromError(error, { phase: 'promise_fulfilled' });
          throw error;
        }
      };

      const wrappedOnRejected = onRejected || function(error) {
        console.error('未处理的Promise拒绝:', error);
        self.recoverFromError(error, { phase: 'promise_rejected' });
        throw error;
      };

      return originalThen.call(this, wrappedOnFulfilled, wrappedOnRejected);
    };
  }

  /**
   * Attempt recovery after an error, then report it.
   *
   * @param {*} error - The thrown value or rejection reason; may be any value,
   *   including null or undefined.
   * @param {object} context - Where the error was observed (event type or promise phase).
   */
  recoverFromError(error, context) {
    // Rejection reasons are not guaranteed to be Error objects, so read the
    // message defensively instead of throwing inside the recovery path.
    console.log('尝试从错误中恢复:', error?.message, context);

    // Roll back to the second-to-last checkpoint.
    // NOTE(review): this skips the most recent checkpoint, presumably because
    // it may already contain the faulty state; confirm this is intended, since
    // a single stored checkpoint is never used.
    const lastCheckpoint = this.stateCheckpoints[this.stateCheckpoints.length - 2];
    if (lastCheckpoint) {
      this.restoreFromCheckpoint(lastCheckpoint);
    } else {
      // No usable checkpoint: fall back to the gentle recovery path.
      this.softRecovery();
    }

    // Report the error.
    this.reportError(error, context);
  }

  restoreFromCheckpoint(checkpoint) {
    console.log('从检查点恢复状态:', checkpoint.timestamp);
    
    // 恢复路由
    if (checkpoint.route !== window.location.pathname) {
      window.history.replaceState(null, '', checkpoint.route);
    }

    // 恢复应用状态
    this.restoreAppState(checkpoint.appState);

    // 恢复DOM状态
    setTimeout(() => {
      this.restoreDomState(checkpoint.domState);
    }, 100);

    // 显示恢复提示
    this.showRecoveryNotification();
  }

  restoreAppState(state) {
    // 根据实际应用实现状态恢复
    if (state.user) {
      window.__APP_STATE__ = window.__APP_STATE__ || {};
      window.__APP_STATE__.user = state.user;
    }
    
    if (state.theme) {
      document.documentElement.setAttribute('data-theme', state.theme);
    }
  }

  restoreDomState(domState) {
    domState.forEach(item => {
      const element = document.getElementById(item.id);
      if (element) {
        if (element.value !== undefined) element.value = item.value;
        if (element.checked !== undefined) element.checked = item.checked;
        if (item.text && !element.value) {
          element.textContent = item.text;
        }
      }
    });
  }

  softRecovery() {
    // 温和恢复策略
    console.log('执行温和恢复策略');
    
    // 1. 清理可能出错的事件监听器
    this.cleanupEventListeners();
    
    // 2. 重置部分应用状态
    this.resetNonCriticalState();
    
    // 3. 重新渲染关键组件
    this.rerenderCriticalComponents();
  }

  cleanupEventListeners() {
    // 清理可能重复或出错的事件监听器
    const elements = document.querySelectorAll('[data-dynamic-listener]');
    elements.forEach(el => {
      el.replaceWith(el.cloneNode(true));
    });
  }

  resetNonCriticalState() {
    // 重置非关键状态
    localStorage.removeItem('temporary_states');
    sessionStorage.removeItem('form_drafts');
  }

  rerenderCriticalComponents() {
    // 重新渲染关键UI组件
    const components = document.querySelectorAll('[data-dynamic-component]');
    components.forEach(component => {
      const componentName = component.dataset.component;
      this.rerenderComponent(componentName, component);
    });
  }

  rerenderComponent(name, container) {
    // 模拟组件重新渲染
    console.log(`重新渲染组件: ${name}`);
    container.innerHTML = `<div>正在恢复${name}组件...</div>`;
    
    // 实际应用中会调用组件渲染函数
    setTimeout(() => {
      container.innerHTML = `<div>${name}组件已恢复</div>`;
    }, 500);
  }

  verifyStateConsistency() {
    // 验证状态一致性
    const lastCheckpoint = JSON.parse(localStorage.getItem('last_state_checkpoint'));
    if (!lastCheckpoint) return;

    const currentState = this.captureAppState();
    const inconsistencies = this.findStateInconsistencies(lastCheckpoint.appState, currentState);

    if (inconsistencies.length > 0) {
      console.warn('检测到状态不一致:', inconsistencies);
      this.reconcileState(inconsistencies);
    }
  }

  findStateInconsistencies(previous, current) {
    const inconsistencies = [];
    
    for (const key in previous) {
      if (JSON.stringify(previous[key]) !== JSON.stringify(current[key