Development/swift/slogging cache log module
Revision as of 02:25, 23 February 2012

import collections
from urllib import unquote
import copy
import time

from swift.common.utils import split_path, get_logger

# month_map has a leading '_' placeholder so month abbreviations index to 1-12
month_map = '_ Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec'.split()
LISTING_PARAMS = set('path limit format delimiter marker end_marker prefix'.split())

class CacheLogProcessor(object):

   """Transform proxy server cache logs"""
    def __init__(self, conf):
        # conf keys read here: server_name, lb_private_ips and service_ips
        # (comma-separated lists), warn_percent, plus standard logging options
        self.server_name = conf.get('server_name', 'proxy-server')
        self.lb_private_ips = [x.strip() for x in
                               conf.get('lb_private_ips', '').split(',')
                               if x.strip()]
        self.service_ips = [x.strip() for x in
                            conf.get('service_ips', '').split(',')
                            if x.strip()]
        self.warn_percent = float(conf.get('warn_percent', '0.8'))
        self.logger = get_logger(conf, log_route='cache-processor')
        self.logger.info(_('init cache log processor'))
    def log_line_parser(self, raw_log):
        """Given a raw access log line, return a dict of the good parts."""
        self.logger.info(_('cache process one line log'))
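        # Input is one access log line of twelve ' - '-separated fields:
        # client_ip, client_user, timestamp, request, upstream_status,
        # status, byte_send, host, uri, referrer, user_agent, cache_status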
       d = {}
       try:
           (client_ip,
           client_user,
           timestamp,
           request,
           upstream_status,
           status,
           byte_send,
           host,
           uri,
           referrer,
           user_agent,
           cache_status) = raw_log.split(' - ')
       except ValueError:
           self.logger.info(_('bad line 1'))
           self.logger.debug(_('Bad line data: %s') % repr(raw_log))
           return {}
       account = None
       container_name = None
       storage = None
       brighthost = None
       com = None
       try:
           (account, container_name, storage, brighthost, com) = host.split('.')
        except ValueError as e:
           self.logger.info(_('bad line 2'))
           self.logger.info(_('bad line 2 %s') % host)
           self.logger.debug(_('Invalid host: %(error)s from data: %(log)s') %
           {'error': e, 'log': repr(raw_log)})
           return {}
       if storage != "storage" or brighthost != "brighthost" or com != "com":
            self.logger.info(_('bad line 3 %s %s %s') % (storage, brighthost, com))
           return {}
       object_name = None
       if uri is not None:
           if uri == '/':
               object_name = 'index.html'
           else:
               object_name = uri.split('/')[1]
       else:
           self.logger.info(_('bad line 4'))
            self.logger.debug(_('Invalid uri in data: %s') % repr(raw_log))
           return {}
       method = None
       if request is not None:
           method = request.split(' ')[0].upper()
       else:
           self.logger.info(_('bad line 5'))
            self.logger.debug(_('Invalid method in data: %s') % repr(raw_log))
           return {}
        if not cache_status or cache_status == '-':
           cache_status = 'NULL'
       d['client_ip'] = client_ip
       d['method'] = method
       d['request'] = request
       d['code'] = int(status)
       d['referrer'] = referrer
       d['user_agent'] = user_agent
       d['byte_send'] = int(byte_send)
       d['cache_status'] = cache_status
        # change localtime to UTC time
        self.logger.info(_('OLD time: %s') % timestamp)
        timestamp = timestamp.replace(':', '/')
        timestamp = timestamp.split(' ')[0]
        day, month, year, hour, minute, second = timestamp.split('/')
        month = '%02d' % month_map.index(month)
        #timestamp = day+"/"+month+"/"+year+":"+hour+":"+minute+":"+second
        #timestamp = time.strftime("%d/%m/%Y/%H/%M/%S", time.gmtime(time.mktime(time.strptime(timestamp, "%d/%m/%Y:%H:%M:%S"))))
        #self.logger.info(_('UTC time: %s') % timestamp)
        #day, month, year, hour, minute, second = timestamp.split('/')
       d['day'] = day
       d['month'] = month
       d['year'] = year
       d['hour'] = hour
       d['minute'] = minute
       d['second'] = second
       d['tz'] = '+0000'
       d['account'] = "AUTH_"+account
       d['container_name'] = container_name
       d['object_name'] = object_name
        d['cache_status'] = cache_status
        self.logger.info(_('finish one line'))
        return d
    def process(self, obj_stream, data_object_account, data_object_container, data_object_name):
        """Generate hourly groupings of data from one access log file."""
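        # Output: {(account, year, month, day, hour): {stat_name: total}},
        # where stat names are 'cache_byte_send' and per-line counters
        # named 'cache_<method>_<code>_<cache_status>'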
       hourly_aggr_info = {}
       total_lines = 0
       bad_lines = 0
       for line in obj_stream:
           line_data = self.log_line_parser(line)
           total_lines += 1
           if not line_data:
               bad_lines += 1
               continue
           account = line_data['account']
           container_name = line_data['container_name']
           year = line_data['year']
           month = line_data['month']
           day = line_data['day']
           hour = line_data['hour']
           byte_send = int(line_data['byte_send'])
           method = line_data['method']
           code = int(line_data['code'])
           object_name = line_data['object_name']
           client_ip = line_data['client_ip']
           cache_status = line_data['cache_status']
           aggr_key = (account, year, month, day, hour)
           d = hourly_aggr_info.get(aggr_key, {})
            d['cache_byte_send'] = d.setdefault('cache_byte_send', 0) + byte_send
            self.logger.info(_('cache byte send: %d') % d['cache_byte_send'])
            code = '%dxx' % (code // 100)
           key = 'cache_'+method+'_'+code+'_'+cache_status
           d[key] = d.setdefault(key, 0) + 1
           hourly_aggr_info[aggr_key] = d
       #if bad_lines > (total_lines * self.warn_percent):
       #    name = '/'.join([data_object_account, data_object_container,
       #                     data_object_name])
       #    self.logger.warning(_('I found a bunch of bad lines in %(name)s '\
       #            '(%(bad)d bad, %(total)d total)') %
       #            {'name': name, 'bad': bad_lines, 'total': total_lines})
       return hourly_aggr_info
   def keylist_mapping(self):
       verb_keys = 'GET HEAD'.split()
       code_keys = '2xx 3xx 4xx 5xx'.split()
       hit_keys = 'MISS HIT EXPIRED UPDATING STALE NULL'.split()
       keylist_mapping = {}
       keylist_mapping['cache_byte_send'] = 'cache_byte_send'
       for verb in verb_keys:
           for code in code_keys:
               for hit in hit_keys:
                   keylist_mapping['cache_'+verb+'_'+code+'_'+hit] = 'cache_'+verb+'_'+code+'_'+hit
       return keylist_mapping
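
A minimal usage sketch, assuming a swift environment (swift.common.utils.get_logger is importable and the swift package has installed gettext's _() builtin); the conf values and the sample log line below are hypothetical, laid out as the twelve ' - '-separated fields that log_line_parser expects:

if __name__ == '__main__':
    conf = {'log_name': 'cache-test'}  # hypothetical minimal conf
    processor = CacheLogProcessor(conf)
    # hypothetical sample line; fields match the parser's split(' - ')
    sample = ' - '.join([
        '10.0.0.1', 'user', '23/Feb/2012:02:25:00 +0000',
        'GET /img.png HTTP/1.1', '200', '200', '1024',
        'acct.photos.storage.brighthost.com', '/img.png',
        '-', 'curl/7.19', 'HIT'])
    parsed = processor.log_line_parser(sample)
    print parsed['account'], parsed['cache_status']  # AUTH_acct HIT
    hourly = processor.process([sample], 'a', 'c', 'o')
    print hourly.keys()  # [('AUTH_acct', '2012', '02', '23', '02')]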