Jump to: navigation, search

Difference between revisions of "StarlingX/Containers/Applications/app-intel-device-plugins"

(Testing GPU device plugin)
(Testing dsa device plugin)
 
Line 21: Line 21:
 
  - Upload and apply intel-device-plugins-operator app
 
  - Upload and apply intel-device-plugins-operator app
  
=== Testing dsa device plugin ===
+
=== Testing DSA device plugin ===
 
- Enable DSA device plugin helm chart:
 
- Enable DSA device plugin helm chart:
  
Line 27: Line 27:
  
 
Apply intel-device-plugins-operator again
 
Apply intel-device-plugins-operator again
Confirm that dsa resources are available:
+
Confirm that DSA resources are available:
  
 
  [sysadmin@controller-0 ~(keystone_admin)]$ kubectl get nodes -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{range $k,$v:=.status.allocatable}}{{"  "}}{{$k}}{{": "}}{{$v}}{{"\n"}}{{end}}{{end}}' | grep '^\([^ ]\)\|\(  dsa\)'
 
  [sysadmin@controller-0 ~(keystone_admin)]$ kubectl get nodes -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{range $k,$v:=.status.allocatable}}{{"  "}}{{$k}}{{": "}}{{$v}}{{"\n"}}{{end}}{{end}}' | grep '^\([^ ]\)\|\(  dsa\)'
Line 110: Line 110:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX0",
+
                 "name":"dpdk_appX0",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 135: Line 136:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX1",
+
                 "name":"dpdk_appX1",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 160: Line 162:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX2",
+
                 "name":"dpdk_appX2",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 185: Line 188:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX3",
+
                 "name":"dpdk_appX3",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 219: Line 223:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX0",
+
                 "name":"dpdk_appX0",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 244: Line 248:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX1",
+
                 "name":"dpdk_appX1",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 269: Line 274:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX2",
+
                 "name":"dpdk_appX2",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 294: Line 299:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX3",
+
                 "name":"dpdk_appX3",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 336: Line 342:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX0",
+
                 "name":"dpdk_appX0",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 361: Line 368:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX1",
+
                 "name":"dpdk_appX1",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 386: Line 394:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX2",
+
                 "name":"dpdk_appX2",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }
Line 411: Line 420:
 
                 "block_on_fault":1,
 
                 "block_on_fault":1,
 
                 "type":"user",
 
                 "type":"user",
                 "name":"appX3",
+
                 "name":"dpdk_appX3",
 +
                "driver_name":"user",
 
                 "threshold":15
 
                 "threshold":15
 
               }
 
               }

Latest revision as of 15:02, 14 August 2025

Application: app-intel-device-plugins

Source

Building

  • From the Debian Build environment:

Build dependent packages:

build-pkgs -p helm build-info

Build helm chart packages and python plugin:

build-pkgs -p python3-k8sapp-intel-device-plugins-operator intel-device-plugins-dsa-helm intel-device-plugins-gpu-helm intel-device-plugins-qat-helm intel-device-plugins-operator-helm intel-device-plugins-secret-observer-helm

Build final helm application:

build-pkgs -p stx-intel-device-plugins-operator-helm

Testing

- Upload and apply node-feature-discovery app
- Upload and apply intel-device-plugins-operator app

Testing DSA device plugin

- Enable DSA device plugin helm chart:

system helm-chart-attribute-modify --enabled true intel-device-plugins-operator intel-device-plugins-dsa intel-device-plugins-operator 

Apply intel-device-plugins-operator again, then confirm that DSA resources are available:

[sysadmin@controller-0 ~(keystone_admin)]$ kubectl get nodes -o go-template='{{range .items}}{{.metadata.name}}{{"\n"}}{{range $k,$v:=.status.allocatable}}{{"  "}}{{$k}}{{": "}}{{$v}}{{"\n"}}{{end}}{{end}}' | grep '^\([^ ]\)\|\(  dsa\)'
controller-0
  dsa.intel.com/wq-user-shared: 40

The plugin can be tested by deploying a pod using the VRAN tools image:

apiVersion: v1
kind: Pod
metadata:
  name: dsa-accel-config-demo
  labels:
    app: dsa-accel-config-demo
spec:
  containers:
    - name: dsa-accel-config-demo
      image: registry.local:9001/docker.io/starlingx/stx-debian-tools-dev:stx.10.0-v1.0.0
      imagePullPolicy: "Always"
      workingDir: "/usr/libexec/accel-config/test/"
      command:
        - "./dsa_user_test_runner.sh"
      args:
        - "--skip-config"
      resources:
        limits:
          dsa.intel.com/wq-user-shared: 1
  restartPolicy: Never
  imagePullSecrets:
    - name: default-registry-key

Review the pod's log:

$ kubectl logs dsa-accel-config-demo | tail
[debug] PF in sub-task[6], consider as passed
[debug] PF in sub-task[7], consider as passed
[debug] PF in sub-task[8], consider as passed
[debug] PF in sub-task[9], consider as passed
[debug] PF in sub-task[10], consider as passed
[debug] PF in sub-task[11], consider as passed
[debug] PF in sub-task[12], consider as passed
[debug] PF in sub-task[13], consider as passed
[debug] PF in sub-task[14], consider as passed
[debug] PF in sub-task[15], consider as passed

If the pod did not successfully launch, possibly because it could not obtain the DSA resource, it will be stuck in the Pending status:

$ kubectl get pods
NAME                      READY   STATUS    RESTARTS   AGE
dsa-accel-config-demo     0/1     Pending   0          7s
This can be verified by checking the Events of the pod:
$ kubectl describe pod dsa-accel-config-demo | grep -A3 Events:
Events:
  Type     Reason            Age    From               Message
  ----     ------            ----   ----               -------
  Warning  FailedScheduling  2m26s  default-scheduler  0/1 nodes are available: 1 Insufficient dsa.intel.com/wq-user-dedicated, 1 Insufficient dsa.intel.com/wq-user-shared.


Customize the configuration

The default configuration uses shared queues for the controller-0 node and dedicated queues for the remaining nodes. Node-specific configuration can be provided by naming the config entry dsa-<node-name>.conf. The default configuration is as follows:

 dsa.conf: |
   [
     {
       "dev":"dsaX",
       "read_buffer_limit":0,
       "groups":[
         {
           "dev":"groupX.0",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.0",
               "mode":"dedicated",
               "size":16,
               "group_id":0,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX0",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.0",
               "group_id":0
             },
           ]
         },
         {
           "dev":"groupX.1",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.1",
               "mode":"dedicated",
               "size":16,
               "group_id":1,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX1",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.1",
               "group_id":1
             },
           ]
         },
         {
           "dev":"groupX.2",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.2",
               "mode":"dedicated",
               "size":16,
               "group_id":2,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX2",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.2",
               "group_id":2
             },
           ]
         },
         {
           "dev":"groupX.3",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.3",
               "mode":"dedicated",
               "size":16,
               "group_id":3,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX3",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.3",
               "group_id":3
             },
           ]
         },
       ]
     }
   ]
 dsa-controller-0.conf: |
   [
     {
       "dev":"dsaX",
       "read_buffer_limit":0,
       "groups":[
         {
           "dev":"groupX.0",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.0",
               "mode":"shared",
               "size":16,
               "group_id":0,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX0",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.0",
               "group_id":0
             },
           ]
         },
         {
           "dev":"groupX.1",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.1",
               "mode":"shared",
               "size":16,
               "group_id":1,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX1",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.1",
               "group_id":1
             },
           ]
         },
         {
           "dev":"groupX.2",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.2",
               "mode":"shared",
               "size":16,
               "group_id":2,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX2",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.2",
               "group_id":2
             },
           ]
         },
         {
           "dev":"groupX.3",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.3",
               "mode":"shared",
               "size":16,
               "group_id":3,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX3",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.3",
               "group_id":3
             },
           ]
         },
       ]
     }
   ]

This default is based on the upstream default configuration file: https://github.com/intel/intel-device-plugins-for-kubernetes/blob/main/deployments/dsa_plugin/overlays/dsa_initcontainer/dsa-config.yaml

The DSA device configuration can be customized via application overrides. For instance, the following config uses dedicated queues for all nodes:

overrideConfig:
 dsa.conf: |
   [
     {
       "dev":"dsaX",
       "read_buffer_limit":0,
       "groups":[
         {
           "dev":"groupX.0",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.0",
               "mode":"dedicated",
               "size":16,
               "group_id":0,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX0",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.0",
               "group_id":0
             },
           ]
         },
         {
           "dev":"groupX.1",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.1",
               "mode":"dedicated",
               "size":16,
               "group_id":1,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX1",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.1",
               "group_id":1
             },
           ]
         },
         {
           "dev":"groupX.2",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.2",
               "mode":"dedicated",
               "size":16,
               "group_id":2,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX2",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.2",
               "group_id":2
             },
           ]
         },
         {
           "dev":"groupX.3",
           "read_buffers_reserved":0,
           "use_read_buffer_limit":0,
           "read_buffers_allowed":8,
           "grouped_workqueues":[
             {
               "dev":"wqX.3",
               "mode":"dedicated",
               "size":16,
               "group_id":3,
               "priority":10,
               "block_on_fault":1,
               "type":"user",
               "name":"dpdk_appX3",
               "driver_name":"user",
               "threshold":15
             }
           ],
           "grouped_engines":[
             {
               "dev":"engineX.3",
               "group_id":3
             },
           ]
         },
       ]
     }
   ]

The custom config can be applied with:

$ system helm-override-update intel-device-plugins-operator intel-device-plugins-dsa intel-device-plugins-operator --values <your-override-file>.yaml

Testing QAT device plugin

The host should have Intel QAT hardware. Installation and testing steps are mentioned here. After installation, verify that the Intel QAT plugin pods are running on each host where application pods can be scheduled to consume QAT resources.

Testing GPU device plugin

The host should have Intel GPU hardware. Installation and testing steps are mentioned here. After installation, verify that the Intel GPU plugin pods are running on each host where application pods can be scheduled to consume GPU resources.