cloudwatch+lambda 实现自动重启

使用 EC2 搭建的 Spring Boot 服务,偶尔会碰到一些问题,导致 service 挂掉。为了保证服务的可用性,打算使用 CloudWatch + Lambda 自动监视,实现 service 的自动重启。

CloudWatch 主要用来监视和报警,可以设置为:5分钟内 health check 失败(NG)的情况下,报警到 SNS。
然后 SNS 的通知又可以作为 Lambda 的 trigger,这样就可以让 Lambda 去重启 server 了。

# Health-check each instance and restart the instance when its service is down.
import boto3
import json
import logging
import os

from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from botocore.config import Config

# Slack channel placed in the 'channel' field of the webhook payload.
# Read from the 'slackChannel' environment variable; a missing variable
# raises KeyError at import time.
SLACK_CHANNEL = os.environ['slackChannel']

# URL the Slack notification is POSTed to.
# Read from the 'slackHookUrl' environment variable (KeyError if unset).
HOOK_URL = os.environ['slackHookUrl']

# Instances to monitor. Each entry provides:
#   id    - EC2 instance id used for describe/stop/start calls
#   ip    - address the health check connects to
#   path  - URL path appended to the ip for the health check
#   group - label used in the SNS notification subject
#   name  - label used in the SNS notification message
INSTANCES = [
    {
        "id": "i-xxxxx",
        "ip": "172.1.1.1",
        "path": "/index.html",
        "group": "groupA",
        "name": "#1号機"
    }
]

# SNS topic that receives the "restart completed" notification.
TOPIC_ARN = 'arn:aws:sns:ap-northeast-1:xxx:xxx'

# Root logger, with INFO-level messages enabled.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

def lambda_handler(event, context):
    """Health-check every configured instance and restart the unhealthy ones.

    For each entry in INSTANCES an HTTP GET is sent to
    ``"http://" + ip + path`` with a 5-second timeout. If the request fails
    (HTTP error status, connection failure, or a timeout/connection error
    while waiting for the response) ``restart_instance`` is called for that
    instance.

    Args:
        event: Invocation event; it is only logged.
        context: Invocation context; unused.

    Returns:
        dict: Always ``{'result': 'ok'}``.
    """
    for instance in INSTANCES:
        healthy = False
        try:
            # The context manager closes the HTTP response (and its socket)
            # instead of leaving it to the garbage collector.
            with urlopen("http://" + instance['ip'] + instance['path'], timeout=5) as confirm_response:
                logger.info("Response: " + str(confirm_response.getcode()))
            healthy = True
        except HTTPError as e:
            logger.info("Request failed(id: %s): %d %s", instance['id'], e.code, e.reason)
        except URLError:
            logger.info("URLError(ip: %s)", instance['ip'])
        except OSError as e:
            # A timeout or reset while waiting for the response is raised as
            # a plain OSError (e.g. socket.timeout), not wrapped in URLError.
            # A hung service must still trigger a restart rather than crash
            # the handler.
            logger.info("Health check failed(ip: %s): %s", instance['ip'], e)

        if not healthy:
            restart_instance(instance)

    logger.info("Event: " + str(event))

    return {'result': 'ok'}

def restart_instance(instance):
    """Stop (if needed) and start the EC2 instance, then send notifications.

    The current state is looked up with ``describe_instances``. Unless the
    instance is already ``'stopped'`` it is stopped and waited on; it is then
    started and waited on until running. Finally the Slack and SNS
    notifications are sent.

    Args:
        instance: One entry of INSTANCES; ``instance['id']`` is the EC2
            instance id.
    """
    instance_id = instance['id']
    ec2 = boto3.client('ec2')
    ec2_instance = boto3.resource('ec2').Instance(instance_id)

    description = ec2.describe_instances(InstanceIds=[instance_id])
    first_reservation = description['Reservations'][0]
    state_name = first_reservation['Instances'][0]['State']['Name']
    logger.info("instance_status: %s", str(state_name))

    already_stopped = state_name == 'stopped'
    if not already_stopped:
        stop_response = ec2.stop_instances(InstanceIds=[instance_id])
        # Block until the stop has completed before issuing the start.
        ec2_instance.wait_until_stopped()
        logger.info("stop result:" + str(stop_response))

    start_response = ec2.start_instances(InstanceIds=[instance_id])
    ec2_instance.wait_until_running()
    logger.info("start result:" + str(start_response))

    # Recovery notifications
    notify_slack(instance)
    notify_restarted(instance)

def notify_slack(instance, timeout=10):
    """Post an "instance restarted" message to the Slack webhook.

    This is a best-effort notification: any network failure is logged and
    swallowed so that the caller can continue.

    Args:
        instance: One entry of INSTANCES; ``id`` and ``ip`` are included in
            the message text.
        timeout: Seconds to wait for the webhook before giving up. Without a
            timeout a stalled connection could block indefinitely.
    """
    slack_message = {
        'channel': SLACK_CHANNEL,
        'text': "instance(id: %s, ip: %s) restarted" % (instance['id'], instance['ip'])
    }
    req = Request(HOOK_URL, json.dumps(slack_message).encode('utf-8'))
    try:
        # The context manager closes the response and its socket.
        with urlopen(req, timeout=timeout) as response:
            response.read()
        logger.info("Message posted to %s", slack_message['channel'])
    except HTTPError as e:
        logger.error("Request failed: %d %s", e.code, e.reason)
    except URLError as e:
        logger.error("Server connection failed: %s", e.reason)
    except OSError as e:
        # Timeouts/resets while reading the response are not wrapped in
        # URLError; keep the notification best-effort for those as well.
        logger.error("Server connection failed: %s", e)

def notify_restarted(instance):
    """Publish a "restart completed" report for *instance* to the SNS topic.

    Args:
        instance: One entry of INSTANCES; ``name`` and ``id`` go into the
            message body and ``group`` into the subject.
    """
    body = '対象のインスタンス(Name: %s, Id: %s)の再起動が完了しました。' % (instance['name'], instance['id'])
    subject = '【%s】サーバー復旧報告' % (instance['group'])

    sns = boto3.client('sns')
    publish_result = sns.publish(
        TopicArn=TOPIC_ARN,
        Message=body,
        Subject=subject,
    )
    logger.info(str(publish_result))

Related Posts

Leave a Reply

Your email address will not be published. Required fields are marked *

Close Bitnami banner
Bitnami