Meetings/Passthrough/POC-implementation

First of all, I want to indicate that this is a POC implementation. Therefore, it contains a lot of hacks that need to be replaced with a real implementation.

Second, this POC implementation is based on the existing PCI Passthrough implementation. Namely: -- PCI whitelist -- PCI alias -- Nova Flavor

Third, the POC implementation is to demonstrate that, with the existing PCI Passthrough implementation, SRIOV can be implemented by enhancing the --nic option in the nova boot command. The options will include three new parameters: --pci-alias --sriov-mode (this was named before all of our discussions) --port-profile

Fourth, the POC implementation has been tried and found to work on Cisco VM-FEX.

The patch is not big. I've added explanations in the key places in the patch. It's very much consistent with the descriptions given in the Google doc. The only key difference is that PCI group is not introduced in the patch yet.

There were some other minor changes or fixes that I didn't include here, because they don't help in explaining the patch and would only add confusion. Therefore I intentionally left them out.

1. adding the new --nic arguments in the nova client. It's pretty much straightforward

2. Nova server: Interpret the newly added --nic arguments and add them in the requested_networks 3. Nova server: Create request specs based on the requested_networks dictionary. As you can see, a nova flavor with PCI requests can coexist with the PCI requests in the --nic option
 * diff --git a/novaclient/base.py b/novaclient/base.py
 * index d270b31..e2f768c 100644
 * --- a/novaclient/base.py
 * +++ b/novaclient/base.py
 * @@ -393,6 +393,12 @@ class BootingManagerWithFind(ManagerWithFind):
 * net_data['fixed_ip'] = nic_info['v4-fixed-ip']
 * if nic_info.get('port-id'):
 * net_data['port'] = nic_info['port-id']
 * +               if nic_info.get('pci-alias'):
 * +                   net_data['pci_alias'] = nic_info['pci-alias']
 * +               if nic_info.get('sriov'):
 * +                   net_data['sriov'] = nic_info['sriov']
 * +               if nic_info.get('port-profile'):
 * +                   net_data['port_profile'] = nic_info['port-profile']
 * all_net_data.append(net_data)
 * body['server']['networks'] = all_net_data
 * diff --git a/novaclient/v1_1/shell.py b/novaclient/v1_1/shell.py
 * index fbe70db..fd7898f 100644
 * --- a/novaclient/v1_1/shell.py
 * +++ b/novaclient/v1_1/shell.py
 * @@ -229,9 +229,11 @@ def _boot(cs, args, reservation_id=None, min_count=None, max_count=None):
 * for nic_str in args.nics:
 * err_msg = ("Invalid nic argument '%s'. Nic arguments must be of the "
 * "form --nic , with at minimum net-id or port-id "
 * -                  "specified." % nic_str)
 * -       nic_info = {"net-id": "", "v4-fixed-ip": "", "port-id": ""}
 * +                  "port-id=port-uuid,pci-alias=pci-alias-name,"
 * +                  "sriov=sriov-mode,port-profile=port-profile-name>, "
 * +                  "with at minimum net-id or port-id specified." % nic_str)
 * +       nic_info = {"net-id": "", "v4-fixed-ip": "", "port-id": "",
 * +                   "pci-alias": "", "sriov": "", "port-profile": ""}
 * for kv_str in nic_str.split(","):
 * try:
 * @@ -395,7 +397,7 @@ def _boot(cs, args, reservation_id=None, min_count=None, max_count=None):
 * metavar=' ',
 * help="Send arbitrary key/value pairs to the scheduler for custom use.")
 * @utils.arg('--nic',
 * -    metavar="",
 * +    metavar=",pci-alias=pci-alias,sriov=sriov-mode,port-profile=port-profile-name",
 * action='append',
 * dest='nics',
 * default=[],
 * @@ -405,7 +407,11 @@ def _boot(cs, args, reservation_id=None, min_count=None, max_count=None):
 * "(required if no port-id), "
 * "v4-fixed-ip: IPv4 fixed address for NIC (optional), "
 * "port-id: attach NIC to port with this UUID "
 * -          "(required if no net-id)")
 * +          "(required if no net-id), "
 * +          "pci-alias: name of a pci alias (optional), "
 * +          "sriov: sriov mode (macvtap or direct, default to macvtap) (optional), "
 * +          "port-profile: name of a port profile, "
 * +          "(required if pci-alias is present)")
 * @utils.arg('--config-drive',
 * metavar=" ",
 * dest='config_drive',
 * metavar=" ",
 * dest='config_drive',
 * diff --git a/nova/api/openstack/compute/servers.py b/nova/api/openstack/compute/servers.py
 * index 6cb2d9f..334eede 100644
 * --- a/nova/api/openstack/compute/servers.py
 * +++ b/nova/api/openstack/compute/servers.py
 * @@ -670,10 +670,30 @@ class Controller(wsgi.Controller):
 * msg = _("Invalid fixed IP address (%s)") % address
 * raise exc.HTTPBadRequest(explanation=msg)
 * +               pci_alias = network.get('pci_alias', None)
 * +               if (pci_alias is not None and
 * +                   not utils.is_valid_pci_alias(pci_alias)):
 * +                   msg = _("Undefined PCI alias (%s)") % pci_alias
 * +                   raise exc.HTTPBadRequest(explanation=msg)
 * +               sriov_mode = network.get('sriov', None)
 * +               if not sriov_mode:
 * +                   sriov_mode = 'macvtap'
 * +               elif (sriov_mode is not None and
 * +                   not sriov_mode in ('direct', 'macvtap')):
 * +                   msg = _("SRIOV mode is either direct or macvtap")
 * +                   raise exc.HTTPBadRequest(explanation=msg)
 * +               port_profile = network.get('port_profile', None)
 * +               if pci_alias is not None and not port_profile:
 * +                   msg = _("Need port profile for pci alias (%s)") % pci_alias
 * +                   raise exc.HTTPBadRequest(explanation=msg)
 * # For neutronv2, requested_networks
 * # should be tuple of (network_uuid, fixed_ip, port_id)
 * if utils.is_neutron:
 * -                   networks.append((network_uuid, address, port_id))
 * +                   networks.append((network_uuid, address, port_id,
 * +                                    pci_alias, sriov_mode, port_profile))
 * else:
 * # check if the network id is already present in the list,
 * # we don't want duplicate networks to be passed
 * +                                    pci_alias, sriov_mode, port_profile))
 * else:
 * # check if the network id is already present in the list,
 * # we don't want duplicate networks to be passed

4. Nova Scheduler: get the PCI requests from the request spec for scheduling:
 * diff --git a/nova/compute/api.py b/nova/compute/api.py
 * index 3098b07..bb24981 100644
 * --- a/nova/compute/api.py
 * +++ b/nova/compute/api.py
 * @@ -71,6 +71,7 @@ from nova import quota
 * from nova import servicegroup
 * from nova import utils
 * from nova import volume
 * +from nova.pci import pci_request
 * LOG = logging.getLogger(__name__)
 * @@ -704,6 +705,9 @@ class API(base.Base):
 * system_metadata = flavors.save_flavor_info(
 * dict, instance_type)
 * +       pci_request.create_pci_requests_for_network(system_metadata,
 * +                                                   requested_networks)
 * base_options = {
 * 'reservation_id': reservation_id,
 * 'image_ref': image_href,
 * base_options = {
 * 'reservation_id': reservation_id,
 * 'image_ref': image_href,
 * 'image_ref': image_href,
 * diff --git a/nova/scheduler/filter_scheduler.py b/nova/scheduler/filter_scheduler.py
 * index 2b8be5e..07a862e 100644
 * --- a/nova/scheduler/filter_scheduler.py
 * +++ b/nova/scheduler/filter_scheduler.py
 * @@ -209,8 +209,8 @@ class FilterScheduler(driver.Scheduler):
 * os_type = request_spec['instance_properties']['os_type']
 * filter_properties['project_id'] = project_id
 * filter_properties['os_type'] = os_type
 * -       pci_requests = pci_request.get_pci_requests_from_flavor(
 * -           request_spec.get('instance_type') or {})
 * +       pci_requests = pci_request.get_pci_requests_from_request_spec(
 * +                                                              request_spec)
 * if pci_requests:
 * filter_properties['pci_requests'] = pci_requests

5. Nova Scheduler: add the PCI passthrough filter to the default scheduler filters. I directly added it in the code for convenience in my test 6. Nova Scheduler: Modified the PCI Passthrough filter so that it won't place non-sriov VMs on a node that supports SRIOV. As you can see, I hacked it based on generic PCI passthrough devices.
 * diff --git a/nova/scheduler/host_manager.py b/nova/scheduler/host_manager.py
 * index e9808e9..35253b1 100644
 * --- a/nova/scheduler/host_manager.py
 * +++ b/nova/scheduler/host_manager.py
 * @@ -48,7 +48,8 @@ host_manager_opts = [
 * 'RamFilter',
 * 'ComputeFilter',
 * 'ComputeCapabilitiesFilter',
 * -                 'ImagePropertiesFilter'
 * +                 'ImagePropertiesFilter',
 * +                 'PciPassthroughFilter'
 * ],
 * help='Which filter class names to use for filtering hosts '
 * 'when not specified in the request.'),

7. Nova Server: code that creates PCI request specs from requested networks, and the code that retrieves them diff --git a/nova/pci/pci_request.py b/nova/pci/pci_request.py
 * diff --git a/nova/scheduler/filters/pci_passthrough_filter.py b/nova/scheduler/filters/pci_passthrough_filter.py
 * index f66795e..68fd2a9 100644
 * --- a/nova/scheduler/filters/pci_passthrough_filter.py
 * +++ b/nova/scheduler/filters/pci_passthrough_filter.py
 * @@ -37,6 +37,6 @@ class PciPassthroughFilter(filters.BaseHostFilter):
 * def host_passes(self, host_state, filter_properties):
 * """Return true if the host has the required PCI devices."""
 * if not filter_properties.get('pci_requests'):
 * -           return True
 * +          return not host_state.pci_stats.support_pci_passthrough
 * return host_state.pci_stats.support_requests(
 * filter_properties.get('pci_requests'))
 * index e25a7c2..fc0dd32 100644
 * --- a/nova/pci/pci_request.py
 * +++ b/nova/pci/pci_request.py
 * @@ -231,3 +231,34 @@ def delete_flavor_pci_info(metadata, *prefixes):
 * to_key = '%spci_requests' % prefix
 * if to_key in metadata:
 * del metadata[to_key]
 * +def create_pci_requests_for_network(metadata, requested_networks, prefix=''):
 * +   """Create pci requests based on requested networks"""
 * +   to_key = '%spci_requests' % prefix
 * +   alias_spec = ''
 * +   for (network_id, fixed_ip, port_id,
 * +        pci_alias, sriov_mode, port_profile) in requested_networks:
 * +       if pci_alias is not None:
 * +           if alias_spec:
 * +               alias_spec += ',%s:1' % pci_alias
 * +           else:
 * +               alias_spec += '%s:1' % pci_alias
 * +   if alias_spec:
 * +       pci_requests = _translate_alias_to_requests(alias_spec)
 * +       if pci_requests:
 * +           metadata[to_key] = jsonutils.dumps(pci_requests)
 * +def get_pci_requests_from_request_spec(request_spec, prefix=''):
 * +   if 'instance_properties' not in request_spec.keys:
 * +       return []
 * +   if 'system_metadata' not in request_spec['instance_properties'].keys:
 * +       return []
 * +   system_metadata = request_spec['instance_properties']['system_metadata']
 * +   pci_requests = system_metadata.get('%spci_requests' % prefix)
 * +   if not pci_requests:
 * +       return []
 * +   return jsonutils.loads(pci_requests)
 * +   pci_requests = system_metadata.get('%spci_requests' % prefix)
 * +   if not pci_requests:
 * +       return []
 * +   return jsonutils.loads(pci_requests)
 * +   return jsonutils.loads(pci_requests)

8: Nova Compute: device allocation, and association of the allocated device with the corresponding NIC (saved in the requested_networks)
 * diff --git a/nova/compute/manager.py b/nova/compute/manager.py
 * index 2ff9843..24bcd49 100644
 * --- a/nova/compute/manager.py
 * +++ b/nova/compute/manager.py
 * @@ -1019,6 +1019,11 @@ class ComputeManager(manager.SchedulerDependentManager):
 * with rt.instance_claim(context, instance, limits):
 * macs = self.driver.macs_for_instance(instance)
 * dhcp_options = self.driver.dhcp_options_for_instance(instance)
 * network_info = self._allocate_network(context, instance,
 * requested_networks, macs, security_groups,
 * dhcp_options)
 * +               index = 0;
 * +               for vif in network_info:
 * +                   vif['requested_network'] = requested_networks[index]
 * +                   index +=1
 * self._instance_update(
 * context, instance['uuid'],
 * self._instance_update(
 * context, instance['uuid'],
 * context, instance['uuid'],

As you can see from above: -- instance_claim allocates the requested PCI devices -- _allocate_network invokes neutron v2 APIs to create the port(s). Or to be exact, each port corresponds to a 'vif' dictionary. -- the change I added is a hack to associate the PCI devices with the VIF

9: Nova Compute, nova neutronv2 API. I didn't add the code for interaction between nova and neutron. The following change is to make it work with the changed requested_network dictionary.


 * diff --git a/nova/network/neutronv2/api.py b/nova/network/neutronv2/api.py
 * index d625cff..df50c71 100644
 * --- a/nova/network/neutronv2/api.py
 * +++ b/nova/network/neutronv2/api.py
 * @@ -236,7 +236,8 @@ class API(base.Base):
 * fixed_ips = {}
 * net_ids = []
 * if requested_networks:
 * -           for network_id, fixed_ip, port_id in requested_networks:
 * +           for (network_id, fixed_ip, port_id,
 * +                pci_alias, sriov_mode, port_profile) in requested_networks:
 * if port_id:
 * port = neutron.show_port(port_id)['port']
 * if port.get('device_id'):
 * @@ -541,7 +542,8 @@ class API(base.Base):
 * net_ids = []
 * -       for (net_id, _i, port_id) in requested_networks:
 * +       for (net_id, _i, port_id, pci_alias,
 * +            sriov_mode, port_profile) in requested_networks:
 * if port_id:
 * try:
 * port = (neutronv2.get_client(context)
 * try:
 * port = (neutronv2.get_client(context)

But based on item 8 above, the neutron v2 API is invoked after the device allocation. Therefore, it's easy to see that the PCI device information can be included in the port binding passed from nova to neutron. And it's also easy to see that neutron can pass back to nova the correct vif_type and other information as required for domain generation.

10. Nova Compute, libvirt driver. This code is completely a hack for the purpose of generating the interface xml in the domain xml. As you can see, if a NIC is associated with a PCI device, the generic hostdev xml is first generated and then later replaced with an 802.1qbh interface xml. The real implementation will have to work with the vif driver to generate the config and interface xml.
 * diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py
 * index aff892f..d3d7ae1 100644
 * --- a/nova/virt/libvirt/driver.py
 * +++ b/nova/virt/libvirt/driver.py
 * @@ -2928,11 +2928,13 @@ class LibvirtDriver(driver.ComputeDriver):
 * guest.add_device(cfg)
 * for vif in network_info:
 * -           cfg = self.vif_driver.get_config(instance,
 * -                                            vif,
 * -                                            image_meta,
 * -                                            inst_type)
 * -           guest.add_device(cfg)
 * +           pci_aliase_idx = 3
 * +           if vif['requested_network'][pci_aliase_idx] is None:
 * +               cfg = self.vif_driver.get_config(instance,
 * +                                                vif,
 * +                                                image_meta,
 * +                                                inst_type)
 * +               guest.add_device(cfg)
 * if CONF.libvirt_type == "qemu" or CONF.libvirt_type == "kvm":
 * # The QEMU 'pty' driver throws away any data if no
 * @@ -3026,6 +3028,56 @@ class LibvirtDriver(driver.ComputeDriver):
 * return guest
 * +   def convert_hostdev_to_interface(self, xml, network_info):
 * +       taglist = xml.split('\n')
 * +       new_taglist = []
 * +       port_profile_idx = 5
 * +       pci_alias_idx = 3
 * +       idx = 0
 * +       end = 0
 * +       start = 0
 * +       addridx = 0
 * +       dev_id = 0
 * +       vif_count = len(network_info)
 * +       for element in taglist:
 * +           tag = element.strip
 * +           if tag.startswith('<hostdev'):
 * +               start = idx
 * +           if start:
 * +               if tag.startswith('<address'):
 * +                  addridx = idx
 * +               if tag.startswith(' '):
 * +                   for vif in xrange(dev_id, vif_count):
 * +                       requested_network = network_info[vif]
 * +                       if requested_network['requested_network'][pci_alias_idx] is not None:
 * +                           dev_id = vif + 1
 * +                           break
 * +                   address = taglist[addridx].strip
 * +                   address = address[1:-2].replace('address', 'type="pci"', 1)
 * +                   pci_interface_xml = """
 * +   
 * +     
 * +     
 * +       
 * +""" % (address, requested_network['address'], requested_network['requested_network'][port_profile_idx])
 * +                   if_tag_list = pci_interface_xml.split('/n')
 * +                   new_taglist.append(*if_tag_list)
 * +                   start = 0
 * +           else:
 * +               new_taglist.append(element)
 * +           idx += 1
 * +       return '\n'.join(new_taglist)
 * def to_xml(self, context, instance, network_info, disk_info,
 * image_meta=None, rescue=None,
 * block_device_info=None, write_to_disk=False):
 * @@ -3047,6 +3099,9 @@ class LibvirtDriver(driver.ComputeDriver):
 * disk_info, rescue, block_device_info)
 * xml = conf.to_xml
 * +       if '<hostdev' in xml:
 * +           xml = self.convert_hostdev_to_interface(xml, network_info)
 * if write_to_disk:
 * instance_dir = libvirt_utils.get_instance_path(instance)
 * xml_path = os.path.join(instance_dir, 'libvirt.xml')
 * @@ -3054,6 +3109,7 @@ class LibvirtDriver(driver.ComputeDriver):
 * LOG.debug(_('End to_xml instance=%(instance)s xml=%(xml)s'),
 * {'instance': instance, 'xml': xml})
 * return xml
 * def _lookup_by_id(self, instance_id):
 * @@ -3047,6 +3099,9 @@ class LibvirtDriver(driver.ComputeDriver):
 * disk_info, rescue, block_device_info)
 * xml = conf.to_xml
 * +       if '<hostdev' in xml:
 * +           xml = self.convert_hostdev_to_interface(xml, network_info)
 * if write_to_disk:
 * instance_dir = libvirt_utils.get_instance_path(instance)
 * xml_path = os.path.join(instance_dir, 'libvirt.xml')
 * @@ -3054,6 +3109,7 @@ class LibvirtDriver(driver.ComputeDriver):
 * LOG.debug(_('End to_xml instance=%(instance)s xml=%(xml)s'),
 * {'instance': instance, 'xml': xml})
 * return xml
 * def _lookup_by_id(self, instance_id):
 * {'instance': instance, 'xml': xml})
 * return xml
 * def _lookup_by_id(self, instance_id):
 * def _lookup_by_id(self, instance_id):
 * def _lookup_by_id(self, instance_id):