在 EC2 上搭建的 Spring Boot 服务偶尔会遇到一些问题,导致服务挂掉。为了保证服务的可用性,打算使用 CloudWatch + Lambda 进行自动监控,实现服务的自动重启。
CloudWatch 主要用于监控和告警:可以设置为当 5 分钟内 health check 结果为 NG 时,向 SNS 发送告警。
而 SNS 的通知又可以作为 Lambda 的触发器(trigger),这样就可以让 Lambda 去重启服务器了。
# Health-check each instance and restart the instance when its service is down.
import boto3
import json
import logging
import os
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
from botocore.config import Config
# --- Module-level configuration -------------------------------------------
# Slack settings are read from the environment at import time; a missing
# variable raises KeyError before the handler ever runs.
# SLACK_CHANNEL is sent as the 'channel' field of the Slack payload and
# HOOK_URL is the URL that notify_slack() posts that payload to.
SLACK_CHANNEL = os.environ['slackChannel']
HOOK_URL = os.environ['slackHookUrl']
# Instances to monitor. Each entry is a dict with the keys:
#   "id"    - EC2 instance id passed to the EC2 describe/stop/start calls
#   "ip"    - address used to build the health-check URL ("http://" + ip + path)
#   "path"  - path part of the health-check URL
#   "group" - label inserted into the SNS notification subject
#   "name"  - label inserted into the SNS notification message
INSTANCES = [
{
"id": "i-xxxxx",
"ip": "172.1.1.1",
"path": "/index.html",
"group": "groupA",
"name": "#1号機"
}
]
# ARN of the SNS topic that notify_restarted() publishes to.
TOPIC_ARN = 'arn:aws:sns:ap-northeast-1:xxx:xxx'
# Root logger, emitting INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
def _is_healthy(instance):
    """Return True when the instance's health-check URL answers successfully.

    Args:
        instance: One entry of INSTANCES; its 'ip' and 'path' values are
            concatenated into the probed URL and 'id' is used for logging.

    Returns:
        True if urlopen() completed without raising, False if it raised
        HTTPError, URLError or any other OSError (timeouts, resets, ...).
    """
    url = "http://" + instance['ip'] + instance['path']
    try:
        # The context manager closes the connection once the status is read.
        with urlopen(url, timeout=5) as confirm_response:
            logger.info("Response: %s", confirm_response.getcode())
    except HTTPError as e:
        logger.info("Request failed(id: %s): %d %s", instance['id'], e.code, e.reason)
        return False
    except URLError:
        logger.info("URLError(ip: %s)", instance['ip'])
        return False
    except OSError as e:
        # A timeout while waiting for the response (service accepted the
        # connection but hangs) is raised as a bare TimeoutError/OSError
        # rather than URLError; it must count as "unhealthy" too.
        logger.info("Health check failed(ip: %s): %s", instance['ip'], e)
        return False
    return True


def lambda_handler(event, context):
    """Lambda entry point: probe every configured instance, restart dead ones.

    Args:
        event: Invocation event; only logged.
        context: Lambda context object; unused.

    Returns:
        The dict {'result': 'ok'} once all instances have been processed.
        Exceptions raised by restart_instance() are not caught here.
    """
    for instance in INSTANCES:
        if not _is_healthy(instance):
            restart_instance(instance)
    logger.info("Event: %s", event)
    return {'result': 'ok'}
def restart_instance(instance):
    """Stop (if needed) and start the EC2 instance, then send notifications.

    The current state is looked up first; any state other than 'stopped'
    triggers a stop request followed by a blocking wait. The instance is
    then started, waited on until running, and both the Slack and the SNS
    notifications are sent.

    Args:
        instance: One entry of INSTANCES; 'id' selects the EC2 instance and
            the whole dict is forwarded to the notification helpers.
    """
    instance_id = instance['id']
    ec2 = boto3.client('ec2')
    ec2_instance = boto3.resource('ec2').Instance(instance_id)

    description = ec2.describe_instances(InstanceIds=[instance_id])
    first_reservation = description['Reservations'][0]
    current_state = first_reservation['Instances'][0]['State']['Name']
    logger.info("instance_status: %s", str(current_state))

    already_stopped = current_state == 'stopped'
    if not already_stopped:
        stop_result = ec2.stop_instances(InstanceIds=[instance_id])
        ec2_instance.wait_until_stopped()
        logger.info("stop result:" + str(stop_result))

    start_result = ec2.start_instances(InstanceIds=[instance_id])
    ec2_instance.wait_until_running()
    logger.info("start result:" + str(start_result))

    # Recovery notifications: Slack first, then SNS.
    for notify in (notify_slack, notify_restarted):
        notify(instance)
def notify_slack(instance):
    """Post an "instance restarted" message to HOOK_URL (best effort).

    The payload is a JSON object with the configured channel and a text
    line naming the instance id and ip. Because a request body is supplied,
    urllib sends it as a POST. Failures are logged and never raised, so a
    broken webhook cannot prevent later notification steps.

    Args:
        instance: One entry of INSTANCES; 'id' and 'ip' appear in the text.
    """
    slack_message = {
        'channel': SLACK_CHANNEL,
        'text': "instance(id: %s, ip: %s) restarted" % (instance['id'], instance['ip']),
    }
    req = Request(HOOK_URL, json.dumps(slack_message).encode('utf-8'))
    try:
        # Bound the call: without a timeout a stalled endpoint would block
        # until the Lambda itself times out. The context manager closes the
        # response.
        with urlopen(req, timeout=10) as response:
            response.read()
    except HTTPError as e:
        logger.error("Request failed: %d %s", e.code, e.reason)
    except URLError as e:
        logger.error("Server connection failed: %s", e.reason)
    except OSError as e:
        # Read timeouts / connection resets are not wrapped in URLError.
        logger.error("Server connection failed: %s", e)
    else:
        logger.info("Message posted to %s", slack_message['channel'])
def notify_restarted(instance):
    """Publish a restart-completed report for *instance* to TOPIC_ARN.

    Args:
        instance: One entry of INSTANCES; 'name' and 'id' are placed in the
            message body and 'group' in the subject.
    """
    body = '対象のインスタンス(Name: %s, Id: %s)の再起動が完了しました。' % (instance['name'], instance['id'])
    subject = '【%s】サーバー復旧報告' % (instance['group'])

    sns = boto3.client('sns')
    publish_result = sns.publish(
        TopicArn=TOPIC_ARN,
        Message=body,
        Subject=subject,
    )
    # Log the raw publish response.
    logger.info(str(publish_result))